/// <summary>
/// Requests the BBS guest-login URL to obtain session cookies, then runs a
/// <c>WebCrawlerSourceNode</c> over the PPPerson board page and prints every
/// entity that flows through the pipeline to the console.
/// </summary>
public void CrawlerTest()
{
    CookieContainer cookieContainer = new CookieContainer();

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(@"https://bbs.sjtu.edu.cn/bbslogin?id=guest");
    request.ProtocolVersion = HttpVersion.Version10;
    request.ContentType = "application/x-www-form-urlencoded";
    request.UserAgent = @"Mozilla/5.0";
    // Without a CookieContainer on the request, HttpWebResponse.Cookies is never
    // populated, so the login cookies would silently be lost.
    request.CookieContainer = cookieContainer;

    // Dispose the response so the underlying connection is released.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        cookieContainer.Add(response.Cookies);
    }

    string url = @"https://bbs.sjtu.edu.cn/bbsdoc?board=PPPerson";

    // Items are the <tr> elements after the first one under each parent;
    // the "ID" column is read from the first <td> of each such row.
    var model = new XMLEntityModel(@".//tr[position() > 1]");
    model.AddXMLColumn("ID", @"./td[1]");

    // NOTE(review): the last argument looks like the host/domain the crawler is
    // restricted to - confirm against WebCrawlerSourceNode.
    WebCrawlerSourceNode crawler = new WebCrawlerSourceNode(new string[] { url }, model, cookieContainer, @"bbs.sjtu.edu.cn");

    PipelineTask.Create(crawler)
        .AddMonitor((entity) => { Console.WriteLine(entity); })
        .Start();
}
/// <summary>
/// Runs a pipeline that reads the "Course" JSON source using a model with
/// "Name" and "Desc" columns, writes it to <c>Output</c> as text, and asserts
/// that the written file matches <c>SampleJsonOutput</c>.
/// </summary>
public void JsonBasic()
{
    // Each item is a "Results" node; columns come from its Name/Desc children.
    var entityModel = new XMLEntityModel(@".//Results");
    entityModel.AddXMLColumn("Name", "./Name");
    entityModel.AddXMLColumn("Desc", "./Desc");

    var pipeline = PipelineTask.FromJsonFile("Course", entityModel).ToTextFile(Output);
    pipeline.Start();

    var outputMatchesSample = TestHelper.CompareTwoFile(SampleJsonOutput, Output);
    Assert.IsTrue(outputMatchesSample);
}
/// <summary>
/// Runs a pipeline that reads <c>XmlSource</c> using a model with "col1" and
/// "col2" columns, writes it to <c>Output</c> as text, and asserts that the
/// written file matches <c>SampleXMLOutput</c>.
/// </summary>
public void XmlBasic()
{
    // Each item is an "Entity" node; columns come from its col1/col2 children.
    var entityModel = new XMLEntityModel(@".//Entity");
    entityModel.AddXMLColumn("col1", "./col1");
    entityModel.AddXMLColumn("col2", "./col2");

    var pipeline = PipelineTask.FromXmlFile(XmlSource, entityModel).ToTextFile(Output);
    pipeline.Start();

    var outputMatchesSample = TestHelper.CompareTwoFile(SampleXMLOutput, Output);
    Assert.IsTrue(outputMatchesSample);
}
/// <summary>
/// Parses <paramref name="content"/> with HtmlAgilityPack and lazily yields one
/// <c>Entity</c> per node matched by <c>Model.ItemXPath</c>.
/// </summary>
/// <param name="content">Markup text to load into an <c>HtmlDocument</c>.</param>
/// <param name="Model">
/// Supplies the item XPath and the column-name to XPath mapping used to fill
/// each entity.
/// </param>
/// <returns>
/// A deferred sequence of non-empty entities. Nothing is yielded when the item
/// XPath matches no nodes. A column is left unset when its XPath matches no
/// node or when the requested attribute is absent on the matched node.
/// </returns>
public static IEnumerable<Entity> GetEntitiesFromContent(string content, XMLEntityModel Model)
{
    HtmlDocument document = new HtmlDocument();
    document.LoadHtml(content);

    var itemNodes = document.DocumentNode.SelectNodes(Model.ItemXPath);
    if (itemNodes == null)
    {
        // SelectNodes yields null (not an empty collection) when nothing matches.
        yield break;
    }

    foreach (var itemNode in itemNodes)
    {
        var entity = new Entity();

        foreach (string column in Model.ColumnXPath.Keys)
        {
            string xpath = Model.ColumnXPath[column];

            // HtmlAgilityPack does not support selecting an attribute via XPath.
            // The helper may rewrite xpath and reports the attribute to read;
            // attributeName stays null when the node's text is wanted instead.
            // NOTE(review): exact rewriting rules live in GetAttributeNameFromXPath.
            string attributeName;
            GetAttributeNameFromXPath(ref xpath, out attributeName);

            var columnNode = itemNode.SelectSingleNode(xpath);
            if (columnNode == null)
            {
                continue;
            }

            if (attributeName == null)
            {
                entity.SetValue(column, columnNode.InnerText);
                continue;
            }

            // The attribute indexer returns null for a missing attribute;
            // skip the column rather than dereferencing null.
            var attribute = columnNode.Attributes[attributeName];
            if (attribute != null)
            {
                entity.SetValue(column, attribute.Value);
            }
        }

        if (!entity.IsEmpty())
        {
            yield return entity;
        }
    }
}
/// <summary>
/// Downloads the Wikipedia "China_provinces" page, parses table rows into
/// entities with "GB" and "Province" columns, writes them to <c>Output</c>,
/// and asserts that the written file matches <c>Province</c>.
/// </summary>
public void HtmlBasic()
{
    // Items are attribute-less <tr> rows of the "wikitable sortable" table;
    // columns are taken from the first and third <td> of each row.
    XMLEntityModel model = new XMLEntityModel(@"//table[@class='wikitable sortable']/tr[not(@*)]");
    model.AddXMLColumn("GB", "./td[1]");
    model.AddXMLColumn("Province", "./td[3]");

    PipelineTask.FromWeb("http://en.wikipedia.org/wiki/China_provinces")
        .ParseHtml(model)
        // NOTE(review): this prints a blank line per entity; presumably it was
        // meant to print the entity itself - confirm before changing.
        .AddMonitor((entity) => { Console.WriteLine(); })
        .ToTextFile(Output, model)
        .Start();

    // The comparison result must be asserted; otherwise the test passes even
    // when the output differs from the expected file.
    Assert.IsTrue(TestHelper.CompareTwoFile(Province, Output));
}
/// <summary>
/// Creates a process node chained to <paramref name="parent"/> and records the
/// model and target column it was given.
/// </summary>
/// <param name="parent">Upstream node, forwarded to the base constructor.</param>
/// <param name="targetColumn">
/// Stored in <c>TargetColumn</c>; presumably the column whose content is parsed
/// as HTML - confirm against the processing method.
/// </param>
/// <param name="model">Stored in <c>Model</c>.</param>
public ParseHtmlProcessNode(DataNode parent, string targetColumn, XMLEntityModel model)
    : base(parent)
{
    this.Model = model;
    this.TargetColumn = targetColumn;
}