Пример #1
0
        /// <summary>
        /// Begins the import of one stream element. Counts the element as emitted, wraps it in a
        /// <c>TikaAsyncWorker</c> and sends the leading record fields (start marker, last-modified
        /// time, virtual file name) to the sink. Unless the full-import flag is set, the element is
        /// then counted as skipped and the method returns when either
        /// (a) the retry-errors flag is not set and the worker's last-modified time is before the
        /// <c>previousRun</c> field, or
        /// (b) the sink's "record/_checkexist" answer contains any of ExistSame, ExistNewer or Exist.
        /// Otherwise the worker is passed to <c>pushPop</c>; if that returns a worker, its record is
        /// emitted through the worker overload of <c>importUrl</c>.
        /// </summary>
        /// <param name="ctx">Pipeline context supplying the import flags and receiving the counters.</param>
        /// <param name="sink">Sink that receives the record fields and answers the existence check.</param>
        /// <param name="elt">Stream element to import.</param>
        private void importUrl(PipelineContext ctx, IDatasourceSink sink, IStreamProvider elt)
        {
            ctx.IncrementEmitted();
            var worker = new TikaAsyncWorker(this, elt);
            String fullName = elt.FullName;

            sink.HandleValue(ctx, "record/_start", fullName);
            sink.HandleValue(ctx, "record/lastmodutc", worker.LastModifiedUtc);
            sink.HandleValue(ctx, "record/virtualFilename", elt.VirtualName);

            // Only an incremental (non-full) import may skip elements.
            bool isFullImport = (ctx.ImportFlags & _ImportFlags.ImportFull) != 0;
            if (!isFullImport)
            {
                bool retryingErrors = (ctx.ImportFlags & _ImportFlags.RetryErrors) != 0;
                if (!retryingErrors && worker.LastModifiedUtc < previousRun)
                {
                    ctx.Skipped++;
                    return;
                }

                // Any of these states reported by the sink means the element is not converted again.
                ExistState skipMask = ExistState.ExistSame | ExistState.ExistNewer | ExistState.Exist;
                ExistState reported = toExistState(sink.HandleValue(ctx, "record/_checkexist", elt));
                if ((reported & skipMask) != 0)
                {
                    ctx.Skipped++;
                    return;
                }
            }

            TikaAsyncWorker finished = pushPop(ctx, sink, worker);
            if (finished == null)
            {
                return;
            }

            importUrl(ctx, sink, finished);
        }
Пример #2
0
 /// <summary>
 /// Exchanges a worker with the worker queue. When <paramref name="newElt"/> is null the queue's
 /// <c>Pop</c> is called; otherwise <paramref name="newElt"/> is handed to
 /// <c>PushAndOptionalPop</c>. The queue's result is cast to <c>TikaAsyncWorker</c> and returned.
 /// </summary>
 /// <param name="ctx">Pipeline context; receives any exception thrown by the queue call or the cast.</param>
 /// <param name="sink">Not used by this method.</param>
 /// <param name="newElt">Worker to push, or null to only pop.</param>
 /// <returns>
 /// The worker returned by the queue, or null when an exception was thrown (the exception is
 /// passed to <c>ctx.HandleException</c>).
 /// </returns>
 private TikaAsyncWorker pushPop(PipelineContext ctx, IDatasourceSink sink, TikaAsyncWorker newElt)
 {
     try
     {
         if (newElt == null)
         {
             return (TikaAsyncWorker)workerQueue.Pop();
         }

         return (TikaAsyncWorker)workerQueue.PushAndOptionalPop(newElt);
     }
     catch (Exception failure)
     {
         ctx.HandleException(failure);
         return null;
     }
 }
Пример #3
0
        /// <summary>
        /// Emits the full record for a worker to the sink. The leading fields (start marker,
        /// last-modified time, virtual file name and virtual root) are sent first, outside any
        /// exception handling. The remaining fields are sent inside a try block: the optional
        /// converted-file location, every property of the worker's HTML processor, optional security
        /// information, mail/attachment details, file size, abstract, head and body, and finally the
        /// end marker followed by the bare "record" key with a null value.
        /// Any exception thrown in that second part is passed to <c>ctx.HandleException</c>.
        /// </summary>
        /// <param name="ctx">Pipeline context; receives exceptions raised while emitting.</param>
        /// <param name="sink">Sink that receives the record fields.</param>
        /// <param name="worker">Worker whose stream element and HTML processor supply the values.</param>
        private void importUrl(PipelineContext ctx, IDatasourceSink sink, TikaAsyncWorker worker)
        {
            String fullName = worker.StreamElt.FullName;

            sink.HandleValue(ctx, "record/_start", fullName);
            sink.HandleValue(ctx, "record/lastmodutc", worker.LastModifiedUtc);
            sink.HandleValue(ctx, "record/virtualFilename", worker.StreamElt.VirtualName);
            sink.HandleValue(ctx, "record/virtualRoot", worker.StreamElt.VirtualRoot);

            try
            {
                var html = worker.HtmlProcessor;

                var storedAs = worker.StoredAs;
                if (storedAs != null)
                {
                    sink.HandleValue(ctx, "record/converted_file", storedAs);
                }

                // Each property of the HTML processor becomes its own record field.
                foreach (var property in html.Properties)
                {
                    sink.HandleValue(ctx, "record/" + property.Key, property.Value);
                }

                if (mustEmitSecurity)
                {
                    emitSecurity(ctx, sink, fullName);
                }

                // The text-mail flag is emitted unconditionally, whatever its value.
                sink.HandleValue(ctx, "record/_istextmail", html.IsTextMail);
                sink.HandleValue(ctx, "record/_numparts", html.numParts);

                var attachments = html.Attachments;
                sink.HandleValue(ctx, "record/_numattachments", attachments.Count);
                foreach (var attachment in attachments)
                {
                    sink.HandleValue(ctx, "record/_attachment", attachment);
                }

                sink.HandleValue(ctx, "record/_filesize", worker.FileSize);

                var summary = html.GetAbstract(abstractLength, abstractDelta);
                sink.HandleValue(ctx, "record/shortcontent", summary);

                var head = html.GetInnerHead();
                sink.HandleValue(ctx, "record/head", head);

                var body = html.GetInnerBody();
                sink.HandleValue(ctx, "record/content", body);

                // End marker first, then the bare "record" key with a null value.
                sink.HandleValue(ctx, "record/_end", fullName);
                sink.HandleValue(ctx, "record", null);
            }
            catch (Exception failure)
            {
                ctx.HandleException(failure);
            }
        }
Пример #4
0
        /// <summary>
        /// Runs the import. Creates the worker queue sized by <c>maxParallel</c>, raises
        /// <c>ServicePointManager.DefaultConnectionLimit</c> to <c>maxParallel</c> when that value is
        /// at least 2 and above the current limit, makes sure the Tika service is started, and
        /// records the shifted date of the last successful run in <c>previousRun</c>.
        /// It then calls <c>importUrl</c> for every provider returned by the stream directory and
        /// afterwards keeps popping workers from the queue, emitting each one, until none is returned.
        /// On exit, successful or not, the queue is drained via <c>PopAllWithoutException</c> and the
        /// security cache is released.
        /// </summary>
        /// <param name="ctx">Pipeline context used for logging, run administration and error handling.</param>
        /// <param name="sink">Sink that receives the emitted records.</param>
        /// <exception cref="BMException">
        /// Wraps any exception thrown while importing a provider; the message includes the original
        /// message and the provider.
        /// </exception>
        public void Import(PipelineContext ctx, IDatasourceSink sink)
        {
            workerQueue = AsyncRequestQueue.Create(maxParallel);
            ctx.ImportLog.Log("TikaDS starting. maxparallel={0}, dbgstore={1}, Q={2}", maxParallel, DbgStoreDir, workerQueue);

            bool connectionLimitTooLow = maxParallel >= 2 && ServicePointManager.DefaultConnectionLimit < maxParallel;
            if (connectionLimitTooLow)
            {
                ctx.ImportLog.Log("Updating connectionLimit for {0} to {1}", ServicePointManager.DefaultConnectionLimit, maxParallel);
                ServicePointManager.DefaultConnectionLimit = maxParallel;
            }

            ensureTikaServiceStarted(ctx);
            previousRun = ctx.RunAdministrations.GetLastOKRunDateShifted(ctx.DatasourceAdmin);
            ctx.ImportLog.Log("Previous (shifted) run was {0}.", previousRun);

            try
            {
                if (mustEmitSecurity)
                {
                    securityCache = new SecurityCache(TikaSecurityAccount.FactoryImpl);
                }

                foreach (var provider in streamDirectory.GetProviders(ctx))
                {
                    try
                    {
                        importUrl(ctx, sink, provider);
                    }
                    catch (Exception inner)
                    {
                        // Re-throw with the offending provider appended to the message.
                        throw new BMException(inner, "{0}\r\nUrl={1}.", inner.Message, provider);
                    }
                }

                // Emit the workers that are still queued; a null result ends the loop.
                TikaAsyncWorker pending;
                while ((pending = pushPop(ctx, sink, null)) != null)
                {
                    importUrl(ctx, sink, pending);
                }
            }
            finally
            {
                workerQueue.PopAllWithoutException();
                Utils.FreeAndNil(ref securityCache);
            }
        }