/// <summary>
/// Requests the BBS login URL with <c>id=guest</c>, collects the cookies returned by that
/// request and then runs a <see cref="WebCrawlerSourceNode"/> over the PPPerson board page,
/// printing every entity the pipeline produces to the console.
/// </summary>
public void CrawlerTest()
{
    CookieContainer cookieContainer = new CookieContainer();

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(@"https://bbs.sjtu.edu.cn/bbslogin?id=guest");
    request.ProtocolVersion = HttpVersion.Version10;
    request.ContentType = "application/x-www-form-urlencoded";
    request.UserAgent = @"Mozilla/5.0";
    // Without a container on the request, HttpWebResponse.Cookies stays empty and the
    // login cookies would never reach the crawler.
    request.CookieContainer = cookieContainer;

    // Dispose the response as soon as the cookies are copied so the connection is released.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        cookieContainer.Add(response.Cookies);
    }

    string url = @"https://bbs.sjtu.edu.cn/bbsdoc?board=PPPerson";

    // Every table row except the first becomes one entity; the first cell is stored as "ID".
    var model = new XMLEntityModel(@".//tr[position() > 1]");
    model.AddXMLColumn("ID", @"./td[1]");

    WebCrawlerSourceNode crawler = new WebCrawlerSourceNode(new string[] { url }, model, cookieContainer, @"bbs.sjtu.edu.cn");

    PipelineTask.Create(crawler)
        .AddMonitor((entity) => { Console.WriteLine(entity); })
        .Start();
}
/// <summary>
/// Passes the original and the actual file path of the task to <c>DeleteFile</c> and then
/// resets both path properties to an empty string.
/// The method takes and returns the same object (with additional data) for easy method chaining.
/// WARNING: In DEBUG builds the <c>DeleteFile</c> calls are compiled out (some tests may depend
/// on the files); the path properties are still cleared.
/// </summary>
/// <param name="task">The pipeline task with the input data</param>
/// <returns>The pipeline task with the input and the output data</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="task"/> is null.</exception>
public PipelineTask DoWork(PipelineTask task)
{
    if (task == null)
    {
        Logger.Error("Null task received");
        throw new ArgumentNullException("task");
    }

    // Do not delete files in DEBUG builds, some test may depend on them.
#if !DEBUG
    DeleteFile(task.OriginalFilePath);
#endif
    task.OriginalFilePath = string.Empty;

#if !DEBUG
    DeleteFile(task.ActualFilePath);
#endif
    task.ActualFilePath = string.Empty;

    Logger.Info("DeleterPostProcessor.DoWork finshed");
    return task;
}
/// <summary>
/// Obtains an opinion for the extracted text of the task via <c>GetOpinionOfText</c> and stores
/// a human readable label for it in <c>AnalyzerResult</c>.
/// The method takes and returns the same object (with additional data) for easy method chaining.
/// </summary>
/// <param name="task">The pipeline task with the input data</param>
/// <returns>The pipeline task with the input and the output data</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="task"/> is null.</exception>
public PipelineTask DoWork(PipelineTask task)
{
    if (task == null)
    {
        Logger.Error("Null task received");
        throw new ArgumentNullException("task");
    }

    var opinion = GetOpinionOfText(task.ExtractedText);

    // Any value other than the four handled here leaves AnalyzerResult untouched.
    if (opinion == OpinionType.Negative)
    {
        task.AnalyzerResult = "Negative Opinion";
    }
    else if (opinion == OpinionType.Neutral)
    {
        task.AnalyzerResult = "Neutral Opinion";
    }
    else if (opinion == OpinionType.Unknown)
    {
        task.AnalyzerResult = "Unknown Opinion";
    }
    else if (opinion == OpinionType.Positive)
    {
        task.AnalyzerResult = "Positive Opinion";
    }

    Logger.Info("OpinionAnalyzer.DoWork finshed, result: {0}", task.AnalyzerResult);
    return task;
}
/// <summary>
/// Runs a pipeline from the CSV sample source into a text file and expects the written
/// file to compare equal to the source.
/// </summary>
public void CsvBasic()
{
    var pipeline = PipelineTask.FromCsvFile(SampleSource).ToTextFile(Output);
    pipeline.Start();

    var filesMatch = TestHelper.CompareTwoFile(SampleSource, Output);
    Assert.IsTrue(filesMatch);
}
/// <summary>
/// Reads the tab separated sample line by line, splits the default column on "\t" and writes
/// the result to a text file, which is expected to compare equal to <c>SimpleSource</c>.
/// </summary>
public void SpiltByT()
{
    var source = new SingleLineFileSourceNode(SimpleSourceT);
    var consumer = new TextFileConsumer(SimpleFileOutput);

    var pipeline = PipelineTask.Create(source)
        .Spilt(Entity.DefaultColumn, separator: "\t")
        .To(consumer);
    pipeline.Start();

    var filesMatch = TestHelper.CompareTwoFile(SimpleSource, SimpleFileOutput);
    Assert.IsTrue(filesMatch);
}
/// <summary>
/// Splits the tab separated sample on "\t" and writes each entity through the template
/// "##col1## dddd ##col2##" into <c>TemplateFileOutput</c>.
/// </summary>
public void FromFileToTemplateFile()
{
    var source = new SingleLineFileSourceNode(SimpleSourceT);

    var pipeline = PipelineTask.Create(source)
        .Spilt(Entity.DefaultColumn, separator: "\t")
        .ToTemplateFile(TemplateFileOutput, "##col1## dddd ##col2##");
    pipeline.Start();

    // NOTE(review): the pipeline writes TemplateFileOutput, yet the assertion compares
    // SimpleSource with SimpleFileOutput - presumably a copy/paste leftover; confirm which
    // expected file this test should check.
    var filesMatch = TestHelper.CompareTwoFile(SimpleSource, SimpleFileOutput);
    Assert.IsTrue(filesMatch);
}
/// <summary>
/// Splits the tab separated sample on "\t", adds a column named "Template" built from a
/// template string, writes the entities to <c>TemplateFileOutput</c> and compares that file
/// with <c>SampleTemplateFileOutput</c>.
/// </summary>
public void AddTemplateColumn()
{
    var source = new SingleLineFileSourceNode(SimpleSourceT);

    // NOTE(review): the second placeholder has no closing "##" - verify that this is what
    // the expected sample file was generated with.
    var pipeline = PipelineTask.Create(source)
        .Spilt(Entity.DefaultColumn, separator: "\t")
        .AddTemplateColumn("Template", "##col1## ##col2")
        .ToTextFile(TemplateFileOutput);
    pipeline.Start();

    var filesMatch = TestHelper.CompareTwoFile(SampleTemplateFileOutput, TemplateFileOutput);
    Assert.IsTrue(filesMatch);
}
/// <summary>
/// Creates the emitter-side task for a logical pipeline task: remembers the logical task,
/// assigns a fresh GUID, creates a message engine named after that GUID and starts with an
/// empty component list.
/// </summary>
/// <param name="objETL">The logical pipeline task; also forwarded to the base constructor.</param>
/// <param name="context">The emitter context forwarded to the base constructor.</param>
public SsisPipelineTask(PipelineTask objETL, SSISEmitterContext context)
    : base(objETL, context)
{
    _logicalETL = objETL;
    _guid = Guid.NewGuid();

    // TODO: Do this for everything
    string messageEngineName = String.Format(
        System.Globalization.CultureInfo.InvariantCulture,
        "__SSIS2008Emitter:SSISDataFlow {0}",
        _guid.ToString());
    _message = MessageEngine.Create(messageEngineName);

    _componentList = new List<SsisComponent>();
}
/// <summary>
/// Loads the "Course" JSON file with a model selecting ".//Results" nodes and their Name/Desc
/// children, writes the entities to a text file and compares it with <c>SampleJsonOutput</c>.
/// </summary>
public void JsonBasic()
{
    var model = new XMLEntityModel(@".//Results");
    model.AddXMLColumn("Name", "./Name");
    model.AddXMLColumn("Desc", "./Desc");

    var pipeline = PipelineTask.FromJsonFile("Course", model).ToTextFile(Output);
    pipeline.Start();

    var filesMatch = TestHelper.CompareTwoFile(SampleJsonOutput, Output);
    Assert.IsTrue(filesMatch);
}
/// <summary>
/// Loads the XML sample with a model selecting ".//Entity" nodes and their col1/col2
/// children, writes the entities to a text file and compares it with <c>SampleXMLOutput</c>.
/// </summary>
public void XmlBasic()
{
    var model = new XMLEntityModel(@".//Entity");
    model.AddXMLColumn("col1", "./col1");
    model.AddXMLColumn("col2", "./col2");

    var pipeline = PipelineTask.FromXmlFile(XmlSource, model).ToTextFile(Output);
    pipeline.Start();

    var filesMatch = TestHelper.CompareTwoFile(SampleXMLOutput, Output);
    Assert.IsTrue(filesMatch);
}
/// <summary>
/// Attaches a monitor to a pipeline over the simple sample file and expects the monitor
/// callback to have run exactly twice once the pipeline has been started.
/// </summary>
public void MonitorConsumer()
{
    int monitorCalls = 0;
    var source = new SingleLineFileSourceNode(SimpleSource);

    var pipeline = PipelineTask.Create(source)
        .AddMonitor((sender, args) => { monitorCalls++; });
    pipeline.Start();

    Assert.AreEqual(2, monitorCalls);
}
/// <summary>
/// Round trip through the file format: the simple sample is split and written with
/// <c>ToFile</c>, read back through a <c>FileSourceNode</c> and written as text; the final
/// text file is expected to compare equal to the original sample.
/// </summary>
public void FileSerilization()
{
    // First pipeline: sample -> split -> file output.
    var writePipeline = PipelineTask.Create(new SingleLineFileSourceNode(SimpleSource))
        .Spilt(Entity.DefaultColumn)
        .ToFile(SimpleFileOutput);
    writePipeline.Start();

    // Second pipeline: file output -> text file.
    var readPipeline = PipelineTask.Create(new FileSourceNode(SimpleFileOutput))
        .ToTextFile(SimpleSourceT);
    readPipeline.Start();

    var filesMatch = TestHelper.CompareTwoFile(SimpleSource, SimpleSourceT);
    Assert.IsTrue(filesMatch);
}
/// <summary>
/// Runs <c>ExtractText</c> on the actual file path of the task and stores the result in
/// <c>ExtractedText</c>.
/// The method takes and returns the same object (with additional data) for easy method chaining.
/// </summary>
/// <param name="task">The pipeline task with the input data</param>
/// <returns>The pipeline task with the input and the output data</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="task"/> is null.</exception>
public PipelineTask DoWork(PipelineTask task)
{
    if (task == null)
    {
        Logger.Error("Null task received");
        throw new ArgumentNullException("task");
    }

    var extracted = ExtractText(task.ActualFilePath);
    task.ExtractedText = extracted;

    Logger.Info("BaseTextExtractor.DoWork finshed");
    return task;
}
/// <summary>
/// Runs <c>UnzipFile</c> on the original file path of the task and stores the returned path
/// in <c>ActualFilePath</c>.
/// The method takes and returns the same object (with additional data) for easy method chaining.
/// </summary>
/// <param name="task">The pipeline task with the input data</param>
/// <returns>The pipeline task with the input and the output data</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="task"/> is null.</exception>
public PipelineTask DoWork(PipelineTask task)
{
    if (task == null)
    {
        Logger.Error("Null task received");
        throw new ArgumentNullException("task");
    }

    var unzippedPath = UnzipFile(task.OriginalFilePath);
    task.ActualFilePath = unzippedPath;

    Logger.Info("UnzipperPreProcessor.DoWork finshed");
    return task;
}
/// <summary>
/// Runs a pipeline over a <c>WebSourceNode</c> for http://www.bing.com and expects exactly one
/// entity whose default column holds a non-null string.
/// </summary>
public void WebSource()
{
    int entityCount = 0;
    string pageContent = null;

    var pipeline = PipelineTask.Create(new WebSourceNode(@"http://www.bing.com"))
        .AddMonitor((sender, args) =>
        {
            entityCount++;
            pageContent = args.CurrentEntity.GetValue<string>(Entity.DefaultColumn);
        });
    pipeline.Start();

    Assert.AreEqual(1, entityCount);
    Assert.IsNotNull(pageContent);
}
/// <summary>
/// Checks that <c>Extend</c> can add a value to an entity: each split entity has two keys
/// before the extension step and three keys after the value "a" has been set.
/// </summary>
public void Extend()
{
    var splitPipeline = PipelineTask.Create(new SingleLineFileSourceNode(SimpleSource))
        .Spilt(Entity.DefaultColumn);

    var extendedPipeline = splitPipeline
        .AddMonitor((sender, args) =>
        {
            Assert.AreEqual(2, args.CurrentEntity.Values.Keys.Count);
        })
        .Extend((entity) =>
        {
            entity.SetValue("a", "");
        });

    extendedPipeline
        .AddMonitor((sender, args) =>
        {
            Assert.AreEqual(3, args.CurrentEntity.Values.Keys.Count);
        })
        .Start();
}
/// <summary>
/// Downloads the China provinces page from Wikipedia, parses the rows of the sortable wiki
/// table into entities with the columns "GB" (first cell) and "Province" (third cell), writes
/// them to a text file and verifies that file against the expected <c>Province</c> file.
/// </summary>
public void HtmlBasic()
{
    XMLEntityModel model = new XMLEntityModel(@"//table[@class='wikitable sortable']/tr[not(@*)]");
    model.AddXMLColumn("GB", "./td[1]");
    model.AddXMLColumn("Province", "./td[3]");

    PipelineTask.FromWeb("http://en.wikipedia.org/wiki/China_provinces")
        .ParseHtml(model)
        .AddMonitor((entity) => { Console.WriteLine(); })
        .ToTextFile(Output, model)
        .Start();

    // The comparison result used to be discarded, so the test could never fail on wrong
    // output; assert it like the other file based tests do.
    Assert.IsTrue(TestHelper.CompareTwoFile(Province, Output));
}
/// <summary>
/// Checks that <c>Convert</c> replaces each entity: split entities have two keys before the
/// conversion and, after being replaced by a new empty <c>Entity</c>, zero keys.
/// </summary>
public void Convert()
{
    var splitPipeline = PipelineTask.Create(new SingleLineFileSourceNode(SimpleSource))
        .Spilt(Entity.DefaultColumn);

    var convertedPipeline = splitPipeline
        .AddMonitor((sender, args) =>
        {
            Assert.AreEqual(2, args.CurrentEntity.Values.Keys.Count);
        })
        .Convert((entity) =>
        {
            return new Entity();
        });

    convertedPipeline
        .AddMonitor((sender, args) =>
        {
            Assert.AreEqual(0, args.CurrentEntity.Values.Keys.Count);
        })
        .Start();
}
/// <summary>
/// Console entry point. Expects <c>-pid PIPELINE_ID</c>, loads that pipeline from the
/// "pipelines" collection of the "test" database, loads every task referenced by the
/// pipeline from the "tasks" collection and prints the sum of the tasks' average times.
/// </summary>
/// <param name="args">Command line arguments; <c>args[0]</c> must be "-pid" and
/// <c>args[1]</c> the pipeline's object id.</param>
/// <returns>A task that completes when the program has finished.</returns>
static async System.Threading.Tasks.Task Main(string[] args)
{
    // Guard the argument count before indexing; a missing or incomplete "-pid" pair gets the
    // usage message instead of an IndexOutOfRangeException.
    if (args == null || args.Length < 2 || args[0] != "-pid")
    {
        Console.WriteLine("Please, start program with pipeline id:\nPipelineConsole.exe -pid YOUR_PIPELINE_ID_HERE");
        Environment.Exit(0);
        return;
    }

    string pid = args[1];

    // Reject malformed ids with a readable message instead of a FormatException.
    ObjectId pipelineId;
    if (!ObjectId.TryParse(pid, out pipelineId))
    {
        Console.WriteLine("Invalid pipeline id: " + pid);
        Environment.Exit(1);
        return;
    }

    string connectionString = ConfigurationManager.ConnectionStrings["MongoDb"].ConnectionString;
    MongoClient client = new MongoClient(connectionString);
    IMongoDatabase database = client.GetDatabase("test");
    var collection = database.GetCollection<BsonDocument>("pipelines");
    var taskCollection = database.GetCollection<BsonDocument>("tasks");

    int pipelineRunTime = 0;
    var builder = Builders<BsonDocument>.Filter;

    var filter = builder.Eq("_id", pipelineId);
    BsonDocument bson = await collection.Find(filter).FirstOrDefaultAsync();
    if (bson == null)
    {
        // FirstOrDefaultAsync yields null when no document matches.
        Console.WriteLine("Pipeline not found: " + pid);
        Environment.Exit(1);
        return;
    }

    Pipeline pipeline = BsonSerializer.Deserialize<Pipeline>(bson);

    foreach (var taskId in pipeline.TaskIds)
    {
        var taskFilter = builder.Eq("_id", taskId);
        BsonDocument bsonTask = await taskCollection.Find(taskFilter).FirstOrDefaultAsync();
        if (bsonTask == null)
        {
            Console.WriteLine("Task not found: " + taskId);
            Environment.Exit(1);
            return;
        }

        PipelineTask task = BsonSerializer.Deserialize<PipelineTask>(bsonTask);
        pipeline.Tasks.Add(task);
        pipelineRunTime += task.AverageTime;
    }

    Console.WriteLine(pipelineRunTime + " seconds");
    Environment.Exit(0);
}
/// <summary>
/// Runs "select * from datasetinfo" against the local DatasetInfo database using a trusted
/// connection, checks that no entity is empty, writes the rows to "CSVOutput" and expects
/// exactly 71 entities to have passed the monitor.
/// </summary>
public void LoadFromSql()
{
    int count = 0;
    var connectInfo = new ConnectInfo() { Db = "DatasetInfo", Server = ".", IsTrust = true };

    PipelineTask.FromSql("select * from datasetinfo", connectInfo)
        .AddMonitor((entity) =>
        {
            count++;
            Assert.IsFalse(entity.IsEmpty());
        })
        .ToCsvFile("CSVOutput")
        .Start();

    // AreEqual reports expected and actual on failure, unlike IsTrue(count == 71).
    Assert.AreEqual(71, count);
}
/// <summary>
/// Uploads the "SimpleAzureSource" CSV into the Azure table described by <c>info</c>, using
/// "##col1##" as partition key template and "##col1####col2##" as row key template, then reads
/// the table back and expects five entities.
/// </summary>
public void AzureTableTest()
{
    // Upload: the monitor only writes an empty line to the console per entity.
    var uploadPipeline = PipelineTask.FromCsvFile("SimpleAzureSource")
        .AddMonitor((entity) => { Console.WriteLine(); })
        .ToAzureTable(info, "##col1##", "##col1####col2##");
    uploadPipeline.Start();

    // Read back and count what the table source yields.
    int readCount = 0;
    var readPipeline = PipelineTask.Create(new AzureTableSourceNode(info))
        .AddMonitor((entity) => { readCount++; });
    readPipeline.Start();

    Assert.AreEqual(5, readCount);
}
/// <summary>
/// Counts the entities observed by a monitor placed before and by one placed after a
/// <c>Filter</c> step whose predicate always returns true; expects one entity before and
/// none after.
/// </summary>
public void Filter()
{
    int beforeFilter = 0;
    int afterFilter = 0;

    // NOTE(review): the expected counts (1 before, 0 after) presumably reflect how Filter
    // treats a predicate that returns true - confirm against the Filter implementation.
    var splitPipeline = PipelineTask.Create(new SingleLineFileSourceNode(SimpleSource))
        .Spilt(Entity.DefaultColumn);

    var filteredPipeline = splitPipeline
        .AddMonitor((sender, args) => { beforeFilter++; })
        .Filter((entity) => { return true; });

    filteredPipeline
        .AddMonitor((sender, args) => { afterFilter++; })
        .Start();

    Assert.AreEqual(1, beforeFilter);
    Assert.AreEqual(0, afterFilter);
}
/// <summary>
/// Creates the attribute and stores the given task in the <c>PipelineTask</c> member.
/// </summary>
/// <param name="coreTask">The pipeline task this attribute refers to.</param>
public PipelineTaskAttribute(PipelineTask coreTask)
{
    this.PipelineTask = coreTask;
}
/// <summary>
/// Appends an <c>AzureTableConsumer</c> built from the given arguments to the pipeline.
/// </summary>
/// <param name="pipelineTask">The pipeline to extend.</param>
/// <param name="azureTableInfo">Table information passed to the consumer.</param>
/// <param name="partitionKeyTemplate">Partition key template passed to the consumer.</param>
/// <param name="rowKeyTemplate">Row key template passed to the consumer.</param>
/// <param name="maxParallelCount">Maximum parallel count passed to the consumer; defaults to 10.</param>
/// <returns>The result of <c>pipelineTask.To(...)</c> for the new consumer.</returns>
public static PipelineTask ToAzureTable(this PipelineTask pipelineTask, AzureTableInfo azureTableInfo, string partitionKeyTemplate, string rowKeyTemplate, int maxParallelCount = 10)
{
    var tableConsumer = new AzureTableConsumer(
        azureTableInfo,
        partitionKeyTemplate,
        rowKeyTemplate,
        maxParallelCount);

    return pipelineTask.To(tableConsumer);
}
/// <summary>
/// Base constructor for pipeline task event attributes; stores the given task in the
/// <c>PipelineTask</c> member.
/// </summary>
/// <param name="pipelineTask">The pipeline task the event attribute refers to.</param>
protected PipelineTaskEventAttribute(PipelineTask pipelineTask)
{
    this.PipelineTask = pipelineTask;
}
/// <summary>
/// Creates the attribute; the task is handed to the base constructor and nothing else is done.
/// </summary>
/// <param name="pipelineTask">The pipeline task forwarded to the base class.</param>
public BeforePipelineTaskAttribute(PipelineTask pipelineTask)
    : base(pipelineTask)
{
}
/// <summary>
/// Creates the attribute; the task is handed to the base constructor and nothing else is done.
/// </summary>
/// <param name="pipelineTask">The pipeline task forwarded to the base class.</param>
public AfterPipelineTaskAttribute(PipelineTask pipelineTask)
    : base(pipelineTask)
{
}