/// <summary>
/// Downloads the messages whose ids are listed (one per line) in <c>config.Ids</c>
/// and writes them as CSV rows (Id, Date, Author, Message) to <c>config.Out</c>.
/// The output file is overwritten and encoded as UTF-8 without a byte-order mark.
/// </summary>
/// <param name="token">Cancellation token supplied by the caller; this implementation does not observe it.</param>
/// <returns>An already completed task: all work is performed synchronously before returning.</returns>
protected override Task Execute(CancellationToken token)
{
    log.LogInformation("Downloading message...");

    // Skip blank lines (for example a trailing newline at the end of the file)
    // so they do not abort the whole run with a FormatException.
    var downloadMessages = File.ReadLines(config.Ids)
        .Where(line => !string.IsNullOrWhiteSpace(line))
        .Select(long.Parse)
        .ToArray();
    log.LogInformation("Total messages to download: {0}", downloadMessages.Length);

    var cred = auth.Authenticate();
    var extractor = new MessageCleanup();
    var monitor = new PerformanceMonitor(downloadMessages.Length);

    using (var streamWriter = new StreamWriter(config.Out, false, new UTF8Encoding(false)))
    using (var csvDataTarget = new CsvWriter(streamWriter))
    {
        // Header row.
        csvDataTarget.WriteField("Id");
        csvDataTarget.WriteField("Date");
        csvDataTarget.WriteField("Author");
        csvDataTarget.WriteField("Message");
        csvDataTarget.NextRecord();

        Auth.ExecuteOperationWithCredentials(
            cred,
            () =>
            {
                // Log the progress monitor every 30 seconds while the download is running.
                using (Observable.Interval(TimeSpan.FromSeconds(30)).Subscribe(item => log.LogInformation(monitor.ToString())))
                {
                    downloader.Download(downloadMessages)
                        .ToObservable()
                        .Select(
                            item =>
                            {
                                try
                                {
                                    // Resolve every value BEFORE touching the writer. If any of these
                                    // throws, the exception is logged below and no partial row is left
                                    // in the writer to be merged with the next message's fields.
                                    var id = item.Id;
                                    var createdAt = item.CreatedAt;
                                    var authorId = item.CreatedBy.Id;
                                    var text = item.Text;
                                    if (config.Clean)
                                    {
                                        text = extractor.Cleanup(text);
                                    }

                                    csvDataTarget.WriteField(id);
                                    csvDataTarget.WriteField(createdAt);
                                    csvDataTarget.WriteField(authorId);
                                    csvDataTarget.WriteField(text);
                                    csvDataTarget.NextRecord();
                                    monitor.Increment();
                                }
                                catch (Exception e)
                                {
                                    // Best effort: a failing message is logged and skipped.
                                    log.LogError(e, "Error");
                                }

                                return item;
                            })
                        // LastOrDefaultAsync keeps Wait() from throwing when nothing was downloaded.
                        .LastOrDefaultAsync()
                        .Wait();
                }
            });
    }

    return Task.CompletedTask;
}
/// <summary>
/// Registers the dictionaries, tokenizer, tagger, word lists, message cleanup,
/// text extractor and memory cache as singletons on <paramref name="services"/>.
/// </summary>
/// <param name="services">The service collection to add the registrations to.</param>
/// <returns>The same <paramref name="services"/> instance that was passed in.</returns>
public IServiceCollection ConfigureServices(IServiceCollection services)
{
    // Registrations are kept in their original order.
    services
        .AddSingleton<IWordsDictionary, BasicEnglishDictionary>()
        .AddSingleton<INRCDictionary>(
            provider =>
            {
                // The dictionary is loaded when the singleton is first created.
                var nrc = new NRCDictionary();
                nrc.Load();
                return nrc;
            })
        .AddSingleton<ISentenceTokenizerFactory, SentenceTokenizerFactory>()
        .AddSingleton<IPOSTagger, NaivePOSTagger>()
        .AddSingleton<BNCList>()
        // Both interfaces below resolve to the single BNCList registration.
        .AddSingleton<IPosTagResolver>(provider => provider.GetService<BNCList>())
        .AddSingleton<IWordFrequencyList>(provider => provider.GetService<BNCList>())
        .AddSingleton(provider => WordTypeResolver.Instance)
        .AddSingleton<IMessageCleanup>(
            provider => new MessageCleanup
            {
                CleanCashTags = false,
                LowerCase = false
            })
        .AddSingleton<IRawTextExtractor, RawWordExtractor>()
        .AddSingleton<IMemoryCache>(provider => new MemoryCache(new MemoryCacheOptions()));

    return services;
}
/// <summary>
/// Runs every enrichment's discovery in turn and appends each accepted message
/// as a CSV row (message id, topic, enrichment type, cleaned text) to <c>config.Out</c>.
/// The file is opened in append mode and encoded as UTF-8 without a byte-order mark.
/// </summary>
/// <param name="token">Cancellation token supplied by the caller; this implementation does not observe it.</param>
/// <returns>An already completed task: all work is performed synchronously before returning.</returns>
protected override Task Execute(CancellationToken token)
{
    log.LogInformation("Starting twitter monitoring...");
    SetupWords();
    RateLimit.RateLimitTrackerMode = RateLimitTrackerMode.TrackAndAwait;

    var cleanup = new MessageCleanup();
    var monitor = new PerformanceMonitor(100000);
    var cred = auth.Authenticate();

    // Log the progress monitor every 30 seconds for the lifetime of the run.
    using (Observable.Interval(TimeSpan.FromSeconds(30)).Subscribe(item => log.LogInformation(monitor.ToString())))
    using (var streamWriter = new StreamWriter(config.Out, true, new UTF8Encoding(false)))
    using (var csvDataTarget = new CsvWriter(streamWriter))
    {
        Auth.ExecuteOperationWithCredentials(
            cred,
            () =>
            {
                var enrichments = Enrichment().ToArray();
                foreach (var enrichment in enrichments)
                {
                    enrichment.Discovery.BatchSize = 5;

                    // Hand this discovery everything already processed by any enrichment.
                    enrichment.Discovery.AddProcessed(enrichments.SelectMany(p => p.Discovery.Processed).ToArray());

                    enrichment.Discovery.Process()
                        .ToObservable()
                        .ObserveOn(TaskPoolScheduler.Default)
                        .Select(
                            x =>
                            {
                                var text = cleanup.Cleanup(x.Message.Text).Replace("\r\n", " ");
                                if (!CanInclude(text, enrichment.Type))
                                {
                                    return x;
                                }

                                // Strip everything outside the ASCII range before writing.
                                text = Regex.Replace(text, @"[^\u0000-\u007F]+", string.Empty);
                                csvDataTarget.WriteField(x.Message.Id);
                                csvDataTarget.WriteField(x.Topic);
                                csvDataTarget.WriteField(enrichment.Type);
                                csvDataTarget.WriteField(text);
                                csvDataTarget.NextRecord();

                                // Flush after every row so the row reaches the file immediately.
                                streamWriter.Flush();
                                monitor.Increment();
                                return x;
                            })
                        // A bare Wait() throws InvalidOperationException when the sequence
                        // completes without producing any element; LastOrDefaultAsync lets an
                        // empty discovery finish quietly so the remaining enrichments still run.
                        .LastOrDefaultAsync()
                        .Wait();
                }
            });
    }

    return Task.CompletedTask;
}
/// <summary>
/// Assigns a fresh result of <c>CreateMessageCleanup()</c> to <c>instance</c>.
/// </summary>
public void Setup() => instance = CreateMessageCleanup();