Beispiel #1
0
 /// <summary>
 /// Exchanges a worker with the worker queue.
 /// When <paramref name="newElt"/> is null, a worker is popped from the queue; otherwise
 /// <paramref name="newElt"/> is handed to PushAndOptionalPop and its result is returned.
 /// </summary>
 /// <param name="ctx">Pipeline context; receives any exception thrown by the queue.</param>
 /// <param name="sink">Not used by this method.</param>
 /// <param name="newElt">Worker to push, or null to only pop.</param>
 /// <returns>
 /// The queue's result cast to TikaAsyncWorker, or null when the queue operation threw
 /// and ctx.HandleException returned normally.
 /// </returns>
 private TikaAsyncWorker pushPop(PipelineContext ctx, IDatasourceSink sink, TikaAsyncWorker newElt)
 {
     try
     {
         if (newElt == null)
         {
             return (TikaAsyncWorker)workerQueue.Pop();
         }

         return (TikaAsyncWorker)workerQueue.PushAndOptionalPop(newElt);
     }
     catch (Exception ex)
     {
         ctx.HandleException(ex);
         return null;
     }
 }
Beispiel #2
0
        /// <summary>
        /// Emits one record to the sink for a worker: file identification first, then the
        /// properties, mail/attachment info, size, abstract, head and body taken from the
        /// worker's HtmlProcessor, and finally the "record" event itself.
        /// </summary>
        /// <param name="ctx">Pipeline context; receives exceptions thrown while emitting the record body.</param>
        /// <param name="sink">Sink that receives every key/value pair.</param>
        /// <param name="worker">Worker whose stream element and HtmlProcessor supply the values.</param>
        private void importUrl(PipelineContext ctx, IDatasourceSink sink, TikaAsyncWorker worker)
        {
            String fullName = worker.StreamElt.FullName;

            // The record header is emitted outside the try block: an exception thrown here
            // goes to the caller instead of ctx.HandleException.
            sink.HandleValue(ctx, "record/_start", fullName);
            sink.HandleValue(ctx, "record/lastmodutc", worker.LastModifiedUtc);
            sink.HandleValue(ctx, "record/virtualFilename", worker.StreamElt.VirtualName);
            sink.HandleValue(ctx, "record/virtualRoot", worker.StreamElt.VirtualRoot);

            try
            {
                var processor = worker.HtmlProcessor;

                // Only emitted when the worker reports a stored (converted) file.
                if (worker.StoredAs != null)
                {
                    sink.HandleValue(ctx, "record/converted_file", worker.StoredAs);
                }

                // Every html property becomes its own field below "record/".
                foreach (var property in processor.Properties)
                {
                    String fieldKey = "record/" + property.Key;
                    sink.HandleValue(ctx, fieldKey, property.Value);
                }

                if (mustEmitSecurity)
                {
                    emitSecurity(ctx, sink, fullName);
                }

                // Mail related information.
                sink.HandleValue(ctx, "record/_istextmail", processor.IsTextMail);
                sink.HandleValue(ctx, "record/_numparts", processor.numParts);
                sink.HandleValue(ctx, "record/_numattachments", processor.Attachments.Count);
                foreach (var attachment in processor.Attachments)
                {
                    sink.HandleValue(ctx, "record/_attachment", attachment);
                }

                // Size, abstract and the actual content.
                sink.HandleValue(ctx, "record/_filesize", worker.FileSize);
                sink.HandleValue(ctx, "record/shortcontent", processor.GetAbstract(abstractLength, abstractDelta));

                sink.HandleValue(ctx, "record/head", processor.GetInnerHead());
                sink.HandleValue(ctx, "record/content", processor.GetInnerBody());

                // Close the record: end marker first, then the record event with a null value.
                sink.HandleValue(ctx, "record/_end", fullName);
                sink.HandleValue(ctx, "record", null);
            }
            catch (Exception ex)
            {
                ctx.HandleException(ex);
            }
        }
 /// <summary>
 /// Imports every provider returned by streamDirectory.GetProviders, one at a time.
 /// _BeforeImport runs first; _AfterImport always runs afterwards (finally).
 /// </summary>
 /// <param name="ctx">Pipeline context; receives the wrapped exception of a failed provider.</param>
 /// <param name="sink">Sink passed through to ImportUrl.</param>
 public void Import(PipelineContext ctx, IDatasourceSink sink)
 {
     _BeforeImport(ctx, sink);
     try
     {
         foreach (var provider in streamDirectory.GetProviders(ctx))
         {
             try
             {
                 ImportUrl(ctx, sink, provider);
             }
             catch (Exception ex)
             {
                 // Add the provider's url to the message, then hand the wrapped exception to
                 // the context. The loop continues with the next provider if HandleException returns.
                 var message = WrapMessage(ex, provider.ToString(), "{0}\r\nUrl={1}.");
                 Exception wrapped = new BMException(ex, message);
                 ctx.HandleException(wrapped);
             }
         }
     }
     finally
     {
         _AfterImport(ctx, sink);
     }
 }
        /// <summary>
        /// Imports the stream of a single provider.
        /// The item is skipped when the context flags it as skipped, or when (on a non-full
        /// import) the sink's "record/_checkexist" answer contains ExistSame, ExistNewer or Exist.
        /// Otherwise the stream is passed to importRecord, either as a whole or, when
        /// object-per-line is active for this element, once per non-empty line.
        /// </summary>
        /// <param name="ctx">Pipeline context; receives exceptions raised while reading/importing.</param>
        /// <param name="sink">Sink that receives the imported values.</param>
        /// <param name="elt">Provider whose context node may override "@splituntil" and "@objectperline".</param>
        private void importUrl(PipelineContext ctx, IDatasourceSink sink, IStreamProvider elt)
        {
            // Per-element settings; the datasource-level fields are the defaults.
            int  splitUntil    = elt.ContextNode.ReadInt("@splituntil", this.splitUntil);
            bool objectPerLine = elt.ContextNode.ReadBool("@objectperline", this.objectPerLine);

            ctx.SendItemStart(elt);
            if ((ctx.ActionFlags & _ActionFlags.Skip) != 0)
            {
                return;
            }

            ExistState existState = ExistState.NotExist;

            if ((ctx.ImportFlags & _ImportFlags.ImportFull) == 0) //Not a full import
            {
                existState = toExistState(sink.HandleValue(ctx, "record/_checkexist", null));
            }

            //Check if we need to convert this file
            if ((existState & (ExistState.ExistSame | ExistState.ExistNewer | ExistState.Exist)) != 0)
            {
                ctx.Skipped++;
                ctx.ImportLog.Log("Skipped: {0}. Date={1}", elt, 0);
                return;
            }

            Stream fs = null;

            try
            {
                fs = elt.CreateStream(ctx);

                // Honour the per-element override (the local), not only the datasource default.
                if (objectPerLine)
                {
                    importObjectPerLine(ctx, sink, fs, splitUntil);
                }
                else
                {
                    importRecord(ctx, sink, fs, splitUntil);
                }
                ctx.OptSendItemStop();
            }
            catch (Exception e)
            {
                ctx.HandleException(e);
            }
            finally
            {
                // The stream was created here, so it is released here as well.
                if (fs != null)
                {
                    fs.Dispose();
                }
            }
        }

        /// <summary>
        /// Splits <paramref name="strm"/> on '\n' bytes and imports every non-empty line as its own record.
        /// A final line without a terminating '\n' is imported as well.
        /// </summary>
        private void importObjectPerLine(PipelineContext ctx, IDatasourceSink sink, Stream strm, int splitUntil)
        {
            byte[]       buf     = new byte[4096];
            MemoryStream pending = new MemoryStream(); // bytes of the line that is currently being collected

            while (true)
            {
                // Read may return fewer bytes than requested; only 0 signals end of stream.
                int len = strm.Read(buf, 0, buf.Length);
                if (len <= 0)
                {
                    break;
                }

                int lineStart = 0;
                for (int i = 0; i < len; i++)
                {
                    if (buf[i] != '\n')
                    {
                        continue;
                    }

                    // Complete line: [lineStart, i) plus whatever was carried over from earlier chunks.
                    pending.Write(buf, lineStart, i - lineStart);
                    flushLineRecord(ctx, sink, pending, splitUntil);
                    lineStart = i + 1;
                }

                // Carry the unterminated remainder of this chunk over to the next read.
                pending.Write(buf, lineStart, len - lineStart);
            }

            flushLineRecord(ctx, sink, pending, splitUntil);
        }

        /// <summary>
        /// Imports the bytes collected in <paramref name="pending"/> as one record (unless it is empty)
        /// and resets <paramref name="pending"/> for the next line.
        /// </summary>
        private void flushLineRecord(PipelineContext ctx, IDatasourceSink sink, MemoryStream pending, int splitUntil)
        {
            if (pending.Length == 0)
            {
                return;
            }

            // A private, exactly-sized copy: importRecord sees only this line, and may close the stream.
            // NOTE(review): a trailing '\r' (CRLF input) is passed on unchanged, as before.
            using (MemoryStream record = new MemoryStream(pending.ToArray(), false))
            {
                importRecord(ctx, sink, record, splitUntil);
            }

            pending.SetLength(0);
            pending.Position = 0;
        }
        /// <summary>
        /// Reads <paramref name="strm"/> as text and emits it to the sink as one record.
        /// In lines mode every line is emitted as "record/line" until maxToRead characters were read.
        /// Otherwise the text is treated as key/value lines: a line that starts with a space or tab
        /// (or, when lenient, contains no ':') is appended to the previous line, and each
        /// completed line is split by splitKV and emitted as "record/{key}".
        /// With stopAtEmpty set, reading stops at the first empty line.
        /// </summary>
        /// <param name="ctx">Pipeline context; receives a BMException that carries the line number on failure.</param>
        /// <param name="sink">Sink that receives the values and the final "record" event.</param>
        /// <param name="elt">Provider of the stream; not used by this method.</param>
        /// <param name="strm">Stream to read. It is not closed here.</param>
        protected override void ImportStream(PipelineContext ctx, IDatasourceSink sink, IStreamProvider elt, Stream strm)
        {
            int lineNo = -1;

            try
            {
                TextReader rdr = strm.CreateTextReader(encoding);

                int charsRead = 0;
                if ((mode & _Mode.lines) != 0)
                {
                    while (charsRead < maxToRead)
                    {
                        lineNo++;
                        String line = rdr.ReadLine();
                        if (line == null)
                        {
                            break;
                        }
                        if (line.Length == 0)
                        {
                            if ((mode & _Mode.stopAtEmpty) != 0)
                            {
                                break;
                            }
                        }
                        sink.HandleValue(ctx, "record/line", line);
                        charsRead += line.Length;
                    }
                }
                else
                {
                    lineNo++;
                    String line = rdr.ReadLine();
                    if (line != null)
                    {
                        charsRead += line.Length;
                    }
                    String key, value;

                    // 'line' always holds the pending (possibly unfolded) key/value line. It is
                    // emitted once the next line turns out to start a new key/value pair.
                    while (line != null)
                    {
                        lineNo++;
                        String nextLine = rdr.ReadLine();
                        if (nextLine == null)
                        {
                            break; // end of input: the pending line is emitted after the loop
                        }

                        // NOTE(review): charsRead is counted here but maxToRead is only enforced in lines mode.
                        charsRead += nextLine.Length;
                        if (nextLine.Length == 0)
                        {
                            if ((mode & _Mode.stopAtEmpty) != 0)
                            {
                                break; // the pending line is emitted after the loop instead of being dropped
                            }
                            continue;
                        }

                        // Count leading spaces/tabs: a line starting with whitespace continues the pending line.
                        int offs = 0;
                        while (offs < nextLine.Length && (nextLine[offs] == ' ' || nextLine[offs] == '\t'))
                        {
                            offs++;
                        }

                        if (offs > 0)
                        {
                            line = line + nextLine.Substring(offs);
                            continue;
                        }

                        // Lenient mode: a line without ':' cannot be a key/value pair, so it is appended too.
                        if (lenient && nextLine.IndexOf(':') < 0)
                        {
                            line = line + nextLine;
                            continue;
                        }

                        key = "record/" + splitKV(line, out value);
                        sink.HandleValue(ctx, key, value);
                        line = nextLine;
                    }

                    // Flush the pending line. This covers both end-of-input and the stopAtEmpty
                    // break; 'line' is only null here when the input had no lines at all.
                    if (line != null)
                    {
                        key = "record/" + splitKV(line, out value);
                        sink.HandleValue(ctx, key, value);
                    }
                }
                sink.HandleValue(ctx, "record", null);
                ctx.IncrementEmitted();
            }
            catch (Exception e)
            {
                e = new BMException(e, "{0}\nLine={1}.", e.Message, lineNo);
                ctx.HandleException(e);
            }
        }
Beispiel #6
0
        /// <summary>
        /// Imports from the url represented by <paramref name="elt"/>.
        /// When the element has a "@command", that command is POSTed together with the request body
        /// and the response is emitted below "response". Otherwise the records of "@index" are
        /// enumerated and every document is emitted as a "record".
        /// </summary>
        /// <param name="ctx">Pipeline context; receives exceptions raised while talking to the server.</param>
        /// <param name="sink">Sink that receives the emitted values.</param>
        /// <param name="elt">Provider; its ToString() is the url and its context node holds the per-element settings.</param>
        private void importUrl(PipelineContext ctx, IDatasourceSink sink, IStreamProvider elt)
        {
            // Per-element settings; the datasource-level fields are the defaults.
            int maxParallel = elt.ContextNode.ReadInt("@maxparallel", this.maxParallel);
            int splitUntil  = elt.ContextNode.ReadInt("@splituntil", this.splitUntil);
            if (splitUntil < 0)
            {
                splitUntil = int.MaxValue; // negative means: no limit
            }
            bool scan = elt.ContextNode.ReadBool("@scan", this.scan);

            String url = elt.ToString();

            ctx.SendItemStart(elt);

            String command = elt.ContextNode.ReadStr("@command", null);
            String index   = null;
            if (command == null)
            {
                index = elt.ContextNode.ReadStr("@index"); //mutual exclusive with command
            }

            String  reqBody = elt.ContextNode.ReadStr("request", this.requestBody);
            JObject req     = reqBody != null ? JObject.Parse(reqBody) : null;

            ctx.DebugLog.Log("Request scan={1}, body={0}", reqBody, scan);
            try
            {
                // Not used further; constructing it makes an invalid url fail before a connection is created.
                Uri parsedUrl = new Uri(url);

                ESConnection    connection = ESHelper.CreateConnection(ctx, url);
                ContextCallback callback   = new ContextCallback(ctx, this, elt);
                connection.Timeout          = timeoutInMs; //Same timeout as what we send to ES
                connection.OnPrepareRequest = callback.OnPrepareRequest;

                if (command == null)
                {
                    ESRecordEnum records = new ESRecordEnum(connection, index, req, numRecords, timeout, scan);
                    if (maxParallel > 0)
                    {
                        records.Async = true;
                    }
                    ctx.ImportLog.Log("Starting scan of {0} records. Index={1}, connection={2}, async={3}, buffersize={4} requestbody={5}, splituntil={6}, scan={7}.", records.Count, index, url, records.Async, numRecords, req != null, splitUntil, scan);

                    foreach (var doc in records)
                    {
                        ctx.IncrementEmitted();
                        sink.HandleValue(ctx, "record/_sort", doc.Sort);
                        sink.HandleValue(ctx, "record/_type", doc.Type);

                        // splitUntil == 0: only the document as a whole is emitted.
                        if (splitUntil != 0)
                        {
                            foreach (var field in doc)
                            {
                                String path = "record/" + field.Key;
                                if (splitUntil == 1)
                                {
                                    // One level only: emit the field value as is.
                                    sink.HandleValue(ctx, path, field.Value);
                                }
                                else
                                {
                                    // Deeper levels are emitted by Pipeline.EmitToken.
                                    Pipeline.EmitToken(ctx, sink, field.Value, path, splitUntil - 1);
                                }
                            }
                        }
                        sink.HandleValue(ctx, "record", doc);
                    }
                }
                else
                {
                    var response = connection.SendCmd("POST", command, reqBody);
                    response.ThrowIfError();
                    Pipeline.EmitToken(ctx, sink, response.JObject, "response", splitUntil);
                }
                ctx.SendItemStop();
            }
            catch (Exception ex)
            {
                ctx.HandleException(ex);
            }
        }