/// <summary>
/// Starts the import of a single stream element: wraps it in a
/// <c>TikaAsyncWorker</c>, emits the leading record fields, decides whether the
/// element can be skipped and, if not, hands the worker to the worker queue.
/// Whatever worker the queue gives back is emitted right away.
/// </summary>
/// <param name="ctx">Pipeline context; supplies the import flags and receives the emitted/skipped counters.</param>
/// <param name="sink">Sink that receives the record fields.</param>
/// <param name="elt">Stream element to import.</param>
private void importUrl(PipelineContext ctx, IDatasourceSink sink, IStreamProvider elt)
{
    ctx.IncrementEmitted();
    TikaAsyncWorker worker = new TikaAsyncWorker(this, elt);

    sink.HandleValue(ctx, "record/_start", elt.FullName);
    sink.HandleValue(ctx, "record/lastmodutc", worker.LastModifiedUtc);
    sink.HandleValue(ctx, "record/virtualFilename", elt.VirtualName);

    // The skip checks below only apply when this is not a full import.
    bool isFullImport = (ctx.ImportFlags & _ImportFlags.ImportFull) != 0;
    if (!isFullImport)
    {
        // Unless errors are being retried, anything older than the previous run is skipped.
        bool retryErrors = (ctx.ImportFlags & _ImportFlags.RetryErrors) != 0;
        if (!retryErrors && worker.LastModifiedUtc < previousRun)
        {
            ctx.Skipped++;
            return;
        }

        // Ask the sink about the record; any of the "exists" states means skip.
        ExistState skipStates = ExistState.ExistSame | ExistState.ExistNewer | ExistState.Exist;
        ExistState reported = toExistState(sink.HandleValue(ctx, "record/_checkexist", elt));
        if ((reported & skipStates) != 0)
        {
            ctx.Skipped++;
            return;
        }
    }

    // Queue this worker; the queue may return a worker in exchange (null if it does not).
    TikaAsyncWorker ready = pushPop(ctx, sink, worker);
    if (ready == null)
    {
        return;
    }
    importUrl(ctx, sink, ready);
}
/// <summary>
/// Single access point to the worker queue. With a non-null
/// <paramref name="newElt"/> the worker is pushed and the queue may hand one back;
/// with <c>null</c> a worker is popped from the queue.
/// </summary>
/// <param name="ctx">Pipeline context; receives any exception thrown by the queue call.</param>
/// <param name="sink">Not used by this method.</param>
/// <param name="newElt">Worker to push, or <c>null</c> to only pop.</param>
/// <returns>
/// The worker returned by the queue (possibly <c>null</c>), or <c>null</c> when the
/// queue call threw and the exception was passed to <c>ctx.HandleException</c>.
/// </returns>
private TikaAsyncWorker pushPop(PipelineContext ctx, IDatasourceSink sink, TikaAsyncWorker newElt)
{
    try
    {
        if (newElt == null)
        {
            return (TikaAsyncWorker)workerQueue.Pop();
        }
        return (TikaAsyncWorker)workerQueue.PushAndOptionalPop(newElt);
    }
    catch (Exception err)
    {
        ctx.HandleException(err);
        return null;
    }
}
/// <summary>
/// Emits the complete record for a worker that came back from the worker queue:
/// the leading fields, the properties and content exposed by the worker's
/// HTML processor, and finally the closing "record" event.
/// </summary>
/// <param name="ctx">Pipeline context; receives any exception raised while emitting the converted content.</param>
/// <param name="sink">Sink that receives the record fields.</param>
/// <param name="worker">Worker whose stream element and conversion result are emitted.</param>
private void importUrl(PipelineContext ctx, IDatasourceSink sink, TikaAsyncWorker worker)
{
    String sourceName = worker.StreamElt.FullName;

    // Leading fields are emitted outside the try block.
    sink.HandleValue(ctx, "record/_start", sourceName);
    sink.HandleValue(ctx, "record/lastmodutc", worker.LastModifiedUtc);
    sink.HandleValue(ctx, "record/virtualFilename", worker.StreamElt.VirtualName);
    sink.HandleValue(ctx, "record/virtualRoot", worker.StreamElt.VirtualRoot);

    try
    {
        var html = worker.HtmlProcessor;

        if (worker.StoredAs != null)
        {
            sink.HandleValue(ctx, "record/converted_file", worker.StoredAs);
        }

        // Every property of the HTML processor becomes a field under "record/".
        foreach (var property in html.Properties)
        {
            sink.HandleValue(ctx, "record/" + property.Key, property.Value);
        }

        if (mustEmitSecurity)
        {
            emitSecurity(ctx, sink, sourceName);
        }

        // The text-mail flag is emitted unconditionally, whatever its value.
        sink.HandleValue(ctx, "record/_istextmail", html.IsTextMail);
        sink.HandleValue(ctx, "record/_numparts", html.numParts);

        // Attachment count first, then one field per attachment.
        sink.HandleValue(ctx, "record/_numattachments", html.Attachments.Count);
        foreach (var attachment in html.Attachments)
        {
            sink.HandleValue(ctx, "record/_attachment", attachment);
        }

        sink.HandleValue(ctx, "record/_filesize", worker.FileSize);
        sink.HandleValue(ctx, "record/shortcontent", html.GetAbstract(abstractLength, abstractDelta));
        sink.HandleValue(ctx, "record/head", html.GetInnerHead());
        sink.HandleValue(ctx, "record/content", html.GetInnerBody());

        // Close the record.
        sink.HandleValue(ctx, "record/_end", sourceName);
        sink.HandleValue(ctx, "record", null);
    }
    catch (Exception err)
    {
        ctx.HandleException(err);
    }
}
/// <summary>
/// Runs the import: creates the worker queue, raises the default connection limit
/// when needed, ensures the Tika service is started, imports every provider from
/// the stream directory and then drains the workers still in the queue.
/// </summary>
/// <param name="ctx">Pipeline context used for logging, run administration and exception handling.</param>
/// <param name="sink">Sink that receives the emitted records.</param>
/// <exception cref="BMException">
/// Wraps any exception escaping the import of a single provider, adding that provider to the message.
/// </exception>
public void Import(PipelineContext ctx, IDatasourceSink sink)
{
    workerQueue = AsyncRequestQueue.Create(maxParallel);
    ctx.ImportLog.Log("TikaDS starting. maxparallel={0}, dbgstore={1}, Q={2}", maxParallel, DbgStoreDir, workerQueue);

    // Raise the default connection limit when it is below maxParallel (only for maxParallel >= 2).
    bool limitTooLow = maxParallel >= 2 && ServicePointManager.DefaultConnectionLimit < maxParallel;
    if (limitTooLow)
    {
        ctx.ImportLog.Log("Updating connectionLimit for {0} to {1}", ServicePointManager.DefaultConnectionLimit, maxParallel);
        ServicePointManager.DefaultConnectionLimit = maxParallel;
    }

    ensureTikaServiceStarted(ctx);

    previousRun = ctx.RunAdministrations.GetLastOKRunDateShifted(ctx.DatasourceAdmin);
    ctx.ImportLog.Log("Previous (shifted) run was {0}.", previousRun);

    // Debug aid: GenericStreamProvider.DumpRoots(ctx, streamDirectory);
    try
    {
        if (this.mustEmitSecurity)
        {
            securityCache = new SecurityCache(TikaSecurityAccount.FactoryImpl);
        }

        foreach (var provider in streamDirectory.GetProviders(ctx))
        {
            try
            {
                importUrl(ctx, sink, provider);
            }
            catch (Exception err)
            {
                // Re-throw with the offending provider appended to the message.
                throw new BMException(err, "{0}\r\nUrl={1}.", err.Message, provider);
            }
        }

        // Emit the workers that are still queued, until the queue returns null.
        TikaAsyncWorker pending;
        while ((pending = pushPop(ctx, sink, null)) != null)
        {
            importUrl(ctx, sink, pending);
        }
    }
    finally
    {
        // Runs on success and on failure: empty the queue and release the security cache.
        workerQueue.PopAllWithoutException();
        Utils.FreeAndNil(ref securityCache);
    }
}