/// <summary>
/// Parses <c>test2</c> and checks both serialization modes: the parameterless
/// <c>ToString()</c> must equal <c>test2Formated</c>, while <c>ToString(false)</c>
/// must reproduce the original input.
/// </summary>
public void TestToString()
{
    var parsed = HtmlDoc.Parse(test2);

    var defaultOutput = parsed.ToString();
    Assert.AreEqual(test2Formated, defaultOutput);

    var unformattedOutput = parsed.ToString(false);
    Assert.AreEqual(test2, unformattedOutput);
}
/// <summary>
/// Learns a region-extraction program from two (table row, 2nd cell) examples taken from
/// sample document 1, then runs it against the 5th table row and prints every region it yields.
/// </summary>
public static void LearnSurnameWithRespectToTableRow()
{
    string html = File.ReadAllText(@"..\..\SampleDocuments\sample-document-1.html");
    HtmlDoc document = HtmlDoc.Create(html);

    // Each example pairs a reference region (a table row) with the region to extract (its 2nd cell).
    var firstRowExample = new ExtractionExample<WebRegion>(
        document.GetRegion("tr:nth-child(1)"),
        document.GetRegion("tr:nth-child(1) td:nth-child(2)"));
    var secondRowExample = new ExtractionExample<WebRegion>(
        document.GetRegion("tr:nth-child(2)"),
        document.GetRegion("tr:nth-child(2) td:nth-child(2)"));

    // No negative examples are supplied.
    Web.Program learned = Web.Learner.Instance.LearnRegion(
        new[] { firstRowExample, secondRowExample },
        Enumerable.Empty<ExtractionExample<WebRegion>>());
    if (learned == null)
    {
        return;
    }

    // Run the program on the 5th table row.
    WebRegion fifthRow = document.GetRegion("tr:nth-child(5)");
    foreach (WebRegion match in learned.Run(fifthRow))
    {
        Console.WriteLine("Learn surname with respect to table row: ");
        Console.WriteLine(match.GetSpecificSelector());
        Console.WriteLine(match.Text());
        Console.WriteLine();
    }
}
/// <summary>
/// Loads the given markup into <c>HtmlDoc</c> and sets <c>PageTitle</c> to the trimmed text of
/// the first <c>//head//title</c> node, or to the trimmed text of the whole document when no
/// such node exists.
/// </summary>
/// <param name="_content">HTML markup to load.</param>
private void DownloadHtmlContent3(String _content)
{
    HtmlDoc.LoadHtml(_content);

    HtmlNode titleNode = HtmlDoc.DocumentNode.SelectSingleNode("//head//title");

    string rawTitle;
    if (titleNode == null)
    {
        // No <title> element: fall back to the text of the entire document.
        rawTitle = HtmlDoc.DocumentNode.InnerText;
    }
    else
    {
        rawTitle = titleNode.InnerText;
    }

    PageTitle = rawTitle.Trim();
}
/// <summary>
/// Learns a program to extract the surname from a given table row (rather than a whole document).
/// </summary>
/// <remarks>
/// Two (table row, 2nd cell) examples from sample document 1 are used for learning; the learned
/// program is then run on the 5th table row. Nothing is printed when no program could be learned
/// or when the program yields no region for that row.
/// </remarks>
public static void LearnSurnameWithRespectToTableRow()
{
    string s = File.ReadAllText(Path.Combine(_sampleDocs, "sample-document-1.html"));
    HtmlDoc doc = HtmlDoc.Create(s);

    WebRegion referenceRegion1 = doc.GetRegion("tr:nth-child(1)"); // 1st table row
    WebRegion exampleRegion1 = doc.GetRegion("tr:nth-child(1) td:nth-child(2)"); // 2nd cell in 1st table row
    WebRegion referenceRegion2 = doc.GetRegion("tr:nth-child(2)"); // 2nd table row
    WebRegion exampleRegion2 = doc.GetRegion("tr:nth-child(2) td:nth-child(2)"); // 2nd cell in 2nd table row

    CorrespondingMemberEquals<WebRegion, WebRegion> exampleSpec1 =
        new CorrespondingMemberEquals<WebRegion, WebRegion>(referenceRegion1, exampleRegion1);
    CorrespondingMemberEquals<WebRegion, WebRegion> exampleSpec2 =
        new CorrespondingMemberEquals<WebRegion, WebRegion>(referenceRegion2, exampleRegion2);

    Web.RegionProgram prog = Web.RegionLearner.Instance.Learn(new[] { exampleSpec1, exampleSpec2 });
    if (prog == null)
    {
        return;
    }

    // Run the program on the 5th table row.
    WebRegion fifthRowRegion = doc.GetRegion("tr:nth-child(5)");
    WebRegion region = prog.Run(new[] { fifthRowRegion })?.SingleOrDefault();
    if (region == null)
    {
        // Both "?." and SingleOrDefault can produce null; dereferencing it below would throw.
        return;
    }

    Console.WriteLine("Learn surname with respect to table row: ");
    Console.WriteLine(region.GetSpecificSelector());
    Console.WriteLine(region.Text());
    Console.WriteLine();
}
/// <summary>
/// Builds the node set for a whole document: chains to the array-taking constructor with the
/// document's first element, then replaces <c>_nodes</c> with an empty array when the document
/// reports itself as empty.
/// </summary>
/// <param name="doc">Document whose first element seeds the node set.</param>
private DownloadedNodes(HtmlDoc doc)
    : this(new[] { doc.FirstElement })
{
    if (!doc.IsEmpty)
    {
        return;
    }

    // No nodes in root: expose an empty set instead of the single seeded entry.
    _nodes = new HtmlElement[0];
}
/// <summary>
/// Formats the loaded document as an exercise question: strips <c>style</c>/<c>class</c>
/// attributes from <c>p</c> and <c>span</c> elements, guarantees that <c>body</c> and
/// <c>head</c> elements exist, appends a style sheet to <c>head</c>, and prepends a heading
/// paragraph of the form <c>index/size.title</c> to the body.
/// </summary>
/// <param name="color">Value written into the <c>color</c> property of the body CSS rule.</param>
/// <param name="title">Text placed after the dot in the heading paragraph.</param>
/// <param name="index">Value rendered (in a highlighted span) before the slash in the heading paragraph.</param>
/// <param name="size">Value rendered after the slash in the heading paragraph.</param>
public void formatQuestion(string color, string title, int index, int size)
{
    // Same contract as RemoveNode/AddAttr/RemoveAttr: without a loaded document there is
    // nothing to format, and continuing would fail on HtmlDoc.QuerySelector below.
    if (HtmlDoc == null)
    {
        return;
    }

    this.RemoveAttr("p", "style", "class");
    this.RemoveAttr("span", "style", "class");

    var body = HtmlDoc.QuerySelector("body");
    if (body == null)
    {
        // Fragment without <body>: wrap the whole existing markup in a new body element.
        body = HtmlDoc.CreateElement("body");
        body.InnerHtml = HtmlDoc.DocumentNode.InnerHtml;
        HtmlDoc.DocumentNode.RemoveAll();
        HtmlDoc.DocumentNode.AppendChild(body);
    }

    var css = new StringBuilder();
    css.AppendLine("body {" + $"color:{color};font-size:30px;font-weight:bold;line-height:1.5;font-family:'微软雅黑';" + "}");
    css.AppendLine("p {margin:0 0 20px 0;}");
    css.AppendLine("p:first-child {margin-bottom:50px;}");

    var style = HtmlDoc.CreateElement("style");
    style.InnerHtml = css.ToString();

    var head = HtmlDoc.QuerySelector("head");
    if (head == null)
    {
        head = HtmlDoc.CreateElement("head");
        HtmlDoc.DocumentNode.InsertBefore(head, body);
    }

    head.AppendChild(style);

    // NOTE(review): title is inserted as raw markup (not HTML-encoded) — confirm callers only pass trusted text.
    body.InnerHtml = $"<p><span style=\"color:#226cfb\">{index}</span>/{size}.{title}</p>" + body.InnerHtml;
}
/// <summary>
/// Learns a program to extract the first surname in the document from two examples
/// from two different documents.
/// </summary>
/// <remarks>
/// The learned program is run on the second document. Nothing is printed when no program could
/// be learned or when the program yields no region.
/// </remarks>
public static void LearnFirstSurnameInDocumentUsingMultipleExamples()
{
    string s1 = File.ReadAllText(Path.Combine(_sampleDocs, "sample-document-1.html"));
    HtmlDoc doc1 = HtmlDoc.Create(s1);
    string s2 = File.ReadAllText(Path.Combine(_sampleDocs, "sample-document-2.html"));
    HtmlDoc doc2 = HtmlDoc.Create(s2);

    // The reference region of each example is the whole document.
    WebRegion referenceRegion1 = new WebRegion(doc1);
    WebRegion referenceRegion2 = new WebRegion(doc2);
    WebRegion exampleRegion1 = doc1.GetRegion("tr:nth-child(1) td:nth-child(2)"); // 2nd cell in 1st table row of doc1
    WebRegion exampleRegion2 = doc2.GetRegion("tr:nth-child(1) td:nth-child(2)"); // 2nd cell in 1st table row of doc2

    CorrespondingMemberEquals<WebRegion, WebRegion> exampleSpec1 =
        new CorrespondingMemberEquals<WebRegion, WebRegion>(referenceRegion1, exampleRegion1);
    CorrespondingMemberEquals<WebRegion, WebRegion> exampleSpec2 =
        new CorrespondingMemberEquals<WebRegion, WebRegion>(referenceRegion2, exampleRegion2);

    Web.RegionProgram prog = Web.RegionLearner.Instance.Learn(new[] { exampleSpec1, exampleSpec2 });
    if (prog == null)
    {
        return;
    }

    // Run the program on the second document.
    WebRegion region = prog.Run(new[] { referenceRegion2 })?.SingleOrDefault();
    if (region == null)
    {
        // Both "?." and SingleOrDefault can produce null; dereferencing it below would throw.
        return;
    }

    Console.WriteLine("Learn first surname in document from multiple examples: ");
    Console.WriteLine(region.GetSpecificSelector());
    Console.WriteLine(region.Text());
    Console.WriteLine();
}
/// <summary>
/// Learns a program and then serializes and deserializes it.
/// </summary>
/// <remarks>
/// The program is learned from one example (whole document to 2nd cell of the 1st table row),
/// round-tripped through <c>Serialize</c> / <c>Load</c>, and the reloaded program is run on the
/// same document. Nothing is printed when no program could be learned.
/// </remarks>
public static void SerializeProgram()
{
    string s = File.ReadAllText(Path.Combine(_sampleDocs, "sample-document-1.html"));
    HtmlDoc doc = HtmlDoc.Create(s);

    WebRegion referenceRegion = new WebRegion(doc);
    WebRegion exampleRegion = doc.GetRegion("tr:nth-child(1) td:nth-child(2)"); // 2nd cell in 1st table row
    CorrespondingMemberEquals<WebRegion, WebRegion> exampleSpec =
        new CorrespondingMemberEquals<WebRegion, WebRegion>(referenceRegion, exampleRegion);

    Web.RegionProgram prog = Web.RegionLearner.Instance.Learn(new[] { exampleSpec });
    if (prog == null)
    {
        return;
    }

    string progText = prog.Serialize();
    Web.RegionProgram loadProg = Web.Loader.Instance.Region.Load(progText);

    // The sibling samples treat the result of Run as nullable ("?."); do the same here so
    // that a null result prints an empty listing instead of throwing in the foreach.
    IEnumerable<WebRegion> executionResult =
        loadProg.Run(new[] { referenceRegion }) ?? Enumerable.Empty<WebRegion>();

    Console.WriteLine("Run first surname extraction program after serialization and deserialization: ");
    foreach (WebRegion region in executionResult)
    {
        Console.WriteLine(region.GetSpecificSelector());
        Console.WriteLine(region.Text());
    }

    Console.WriteLine();
}
/// <summary>
/// Parses <c>test3</c> and checks that the resulting element exposes the expected
/// <c>http-equiv</c> and <c>content</c> attribute values.
/// </summary>
public void TestAttributes()
{
    var meta = HtmlDoc.Parse(test3);

    var httpEquiv = meta.Attributes["http-equiv"];
    Assert.AreEqual("Content-Type", httpEquiv);

    var content = meta.Attributes["content"];
    Assert.AreEqual("text/html; charset=utf-8", content);
}
/// <summary>
/// Learns a region-extraction program from two examples, one per sample document, where each
/// example maps the whole document to the 2nd cell of its 1st table row. The learned program is
/// then run on the second document and every region it yields is printed.
/// </summary>
public static void LearnFirstSurnameInDocumentUsingMultipleExamples()
{
    HtmlDoc firstDocument = HtmlDoc.Create(File.ReadAllText(@"..\..\SampleDocuments\sample-document-1.html"));
    HtmlDoc secondDocument = HtmlDoc.Create(File.ReadAllText(@"..\..\SampleDocuments\sample-document-2.html"));

    // Reference regions cover the entire document.
    WebRegion firstWhole = new WebRegion(firstDocument);
    WebRegion secondWhole = new WebRegion(secondDocument);

    // Expected outputs: 2nd cell in the 1st table row of each document.
    WebRegion firstCell = firstDocument.GetRegion("tr:nth-child(1) td:nth-child(2)");
    WebRegion secondCell = secondDocument.GetRegion("tr:nth-child(1) td:nth-child(2)");

    var positives = new[]
    {
        new ExtractionExample<WebRegion>(firstWhole, firstCell),
        new ExtractionExample<WebRegion>(secondWhole, secondCell),
    };

    // No negative examples are supplied.
    Web.Program learned = Web.Learner.Instance.LearnRegion(
        positives,
        Enumerable.Empty<ExtractionExample<WebRegion>>());
    if (learned == null)
    {
        return;
    }

    // Run the program on the second document.
    foreach (WebRegion match in learned.Run(secondWhole))
    {
        Console.WriteLine("Learn first surname in document from multiple examples: ");
        Console.WriteLine(match.GetSpecificSelector());
        Console.WriteLine(match.Text());
        Console.WriteLine();
    }
}
/// <summary>
/// Parses <c>test1</c> and checks that it has more than one descendant overall and exactly six
/// <c>li</c> descendants under its <c>ul</c> element.
/// </summary>
public void TestDescendants()
{
    var root = HtmlDoc.Parse(test1);

    var totalDescendants = root.Descendants().Count();
    Assert.IsTrue(totalDescendants > 1);

    var listItemCount = root.Element("ul").Descendants("li").Count();
    Assert.AreEqual(6, listItemCount);
}
/// <summary>
/// Downloads the page at <c>URL</c>, stores the markup in <c>URLHTML</c>, loads it into
/// <c>HtmlDoc</c>, refreshes the hosting view (if any), and restarts the XPath search when a
/// selection text is set. Does nothing while refresh is disabled or before initialization.
/// </summary>
private async void VisitUrlAsync()
{
    if (!enableRefresh || hasInit == false)
    {
        return;
    }

    URLHTML = await MainFrm.RunBusyWork(() =>
    {
        HttpStatusCode statusCode;
        RequestManager.Instance.RequestCount++;
        return GetHtml(URL, out statusCode);
    });

    // The site answered with a redirect notice: tell the user and stop here.
    if (URLHTML.Contains("尝试自动重定向"))
    {
        var answer = MessageBox.Show(
            "网站提示: " + URLHTML + "\n 通常原因是网站对请求合法性做了检查, 建议填写关键字对网页内容进行自动嗅探",
            "提示信息",
            MessageBoxButton.OK);
        if (answer == MessageBoxResult.OK)
        {
            return;
        }
    }

    ControlExtended.SafeInvoke(() =>
    {
        HtmlDoc.LoadHtml(URLHTML);
        if (!MainDescription.IsUIForm)
        {
            return;
        }

        var dockManager = MainFrm as IDockableManager ?? ControlExtended.DockableManager;
        var hostControl = dockManager?.ViewDictionary.FirstOrDefault(d => d.Model == this);
        if (hostControl == null)
        {
            return;
        }

        // The concrete view type is not known here, so UpdateHtml is bound at run time.
        dynamic view = hostControl.View;
        if (IsSuperMode == false)
        {
            view.UpdateHtml(URLHTML);
            OnPropertyChanged("HtmlDoc");
        }
        else
        {
            view.UpdateHtml("超级模式下内置浏览器不展示内容,请查看左侧的文本内容");
        }
    }, name: "解析html文档");

    if (!string.IsNullOrWhiteSpace(selectText))
    {
        currentXPaths = HtmlDoc.SearchXPath(SelectText, () => IsAttribute).GetEnumerator();
        GetXPathAsync();
    }

    OnPropertyChanged("URLHTML");
}
/// <summary>
/// Downloads the page at <c>URL</c>, stores the markup in <c>URLHTML</c>, loads it into
/// <c>HtmlDoc</c>, refreshes the hosting view (if any), and restarts the XPath search when a
/// selection text is set. Does nothing while refresh is disabled or before initialization.
/// All user-visible texts are looked up through <c>GlobalHelper.Get</c>.
/// </summary>
private async void VisitUrlAsync()
{
    if (!enableRefresh || hasInit == false)
    {
        return;
    }

    URLHTML = await MainFrm.RunBusyWork(() =>
    {
        HttpStatusCode statusCode;
        ConfigFile.GetConfig<DataMiningConfig>().RequestCount++;
        return GetHtml(URL, out statusCode);
    }, title: GlobalHelper.Get("long_visit_web"));

    // The downloaded text contains the marker looked up under key_671: show it to the user and stop.
    if (URLHTML.Contains(GlobalHelper.Get("key_671")))
    {
        var answer = MessageBox.Show(
            GlobalHelper.Get("key_672") + URLHTML + GlobalHelper.Get("key_673"),
            GlobalHelper.Get("key_99"),
            MessageBoxButton.OK);
        if (answer == MessageBoxResult.OK)
        {
            return;
        }
    }

    ControlExtended.SafeInvoke(() =>
    {
        HtmlDoc.LoadHtml(URLHTML);
        if (!MainDescription.IsUIForm)
        {
            return;
        }

        var dockManager = MainFrm as IDockableManager ?? ControlExtended.DockableManager;
        var hostControl = dockManager?.ViewDictionary.FirstOrDefault(d => d.Model == this);
        if (hostControl == null)
        {
            return;
        }

        // The concrete view type is not known here, so UpdateHtml is bound at run time.
        dynamic view = hostControl.View;
        if (IsSuperMode == false)
        {
            view.UpdateHtml(URLHTML);
            OnPropertyChanged("HtmlDoc");
        }
        else
        {
            view.UpdateHtml(GlobalHelper.Get("key_674"));
        }
    }, name: GlobalHelper.Get("key_675"));

    if (!string.IsNullOrWhiteSpace(selectText))
    {
        currentXPaths = HtmlDoc.SearchXPath(SelectText, () => IsAttribute).GetEnumerator();
        GetXPathAsync();
    }

    OnPropertyChanged("URLHTML");
}
/// <summary>
/// Parses <c>Data/simple.htm</c> and compares the serialized root element with an empty string.
/// </summary>
/// <remarks>
/// The test attribute below is commented out, so the method is not marked as a test here.
/// </remarks>
//[TestMethod]
public void MyTestMethod()
{
    // Dispose the stream so the file handle is released even when the assertion fails.
    // Parsing and serialization stay inside the using block in case the document reads lazily.
    using (FileStream file = new FileStream("Data/simple.htm", FileMode.Open))
    {
        var e = new HtmlDoc(file).RootElement;
        var s = e.ToString();
        Assert.AreEqual("", s);
    }
}
/// <summary>
/// Checks parsing of the <c>test4</c> and <c>test44</c> inputs: the first must produce six
/// descendants, and the second must have a <c>ul</c> element two <c>Element()</c> levels below
/// the parsed root.
/// </summary>
public void TestMismatch()
{
    var first = HtmlDoc.Parse(test4);
    var descendantCount = first.Descendants().Count();
    Assert.AreEqual(6, descendantCount);

    var nested = HtmlDoc.Parse(test44).Element().Element();
    Assert.AreEqual("ul", nested.Name);

    // Pending assertions, disabled in the original:
    //Assert.AreEqual<int>(3, e.Descendants().Count());
    ///TODO: look up html tags
    //Assert.AreEqual<string>("ul", e.Descendants().Last().Name);
}
/// <summary>
/// Checks <c>Element(name)</c> lookups on the document built from <c>stream</c>: the root is
/// expected to have no <c>h2</c> element, to have a <c>hello</c> element, and that
/// <c>hello</c> element is expected to contain an <c>h2</c>.
/// </summary>
public void TestElement()
{
    var root = new HtmlDoc(stream).RootElement;

    HtmlElement missingHeading = root.Element("h2");
    Assert.IsNull(missingHeading);

    HtmlElement hello = root.Element("hello");
    Assert.AreEqual("hello", hello.Name);

    HtmlElement heading = hello.Element("h2");
    Assert.AreEqual("h2", heading.Name);
}
/// <summary>
/// Searches the loaded document for XPaths matching the current selection text and rebuilds
/// <c>CrawlItems</c> with one entry per XPath, each carrying the inner text of the node that
/// XPath selects. Does nothing when the selection text is blank.
/// </summary>
private void Search()
{
    if (string.IsNullOrWhiteSpace(selectText))
    {
        return;
    }

    // Materialize the search before clearing the previous results.
    var foundPaths = HtmlDoc.SearchXPath(SelectText, () => true).ToList();
    CrawlItems.Clear();

    foundPaths.Execute(path => CrawlItems.Add(
        new CrawlItem
        {
            XPath = path,
            SampleData1 = HtmlDoc.DocumentNode.SelectSingleNodePlus(path, SelectorFormat.XPath).InnerText
        }));
}
/// <summary>
/// Validates a CSS selector against a freshly created document and records a
/// <c>BadCssSelector</c> error when validation fails. Null or empty selectors are ignored.
/// </summary>
/// <param name="selector">CSS selector text to validate.</param>
/// <param name="lineInfo">Source position attached to the recorded error.</param>
private void VerifyCssSelector(string selector, Semantic.LineInfo lineInfo)
{
    if (!string.IsNullOrEmpty(selector))
    {
        HtmlDoc validator = Config.DomFactory.Create();
        if (!validator.ValidateCss(selector))
        {
            Errors.Add(new BadCssSelector(selector, lineInfo));
        }
    }
}
/// <summary>
/// Normalizes the post message, fills in its attachments, parses the HTML-decoded message
/// wrapped in a <c>div</c>, and appends the regrouped content to <paramref name="item"/>.
/// </summary>
/// <param name="post">Post whose <c>Message</c> is normalized in place and then parsed.</param>
/// <param name="item">Target that receives the regrouped content.</param>
private void BuildContent(PostItem post, S1PostItem item)
{
    post.Message = post.Message ?? "";

    // Work around: repair the malformed "<imgwidth=" tag and strip line feeds.
    post.Message = post.Message.Replace("<imgwidth=", "<img width=").Replace("\n", "");
    FillAttachment(post);

    var decodedMessage = S1Resource.HttpUtility.HtmlDecode(post.Message);
    var wrappedMarkup = string.Format("<div>{0}</div>", decodedMessage);
    var content = new HtmlDoc(wrappedMarkup).RootElement;
    if (content == null)
    {
        return;
    }

    item.AddRange(SimpleParser.SimpleThreadParser.ReGroupContent(content));
}
/// <summary>
/// Parses <c>Data/simple.htm</c> and checks <c>FindElements</c>: exactly one <c>table</c> has
/// <c>width="98%"</c> and <c>cellpadding="7"</c>, and the document contains three
/// <c>table</c> elements in total.
/// </summary>
public void TestFindElement()
{
    // Dispose the stream so the file handle is released even when an assertion fails.
    // The deferred LINQ queries are evaluated inside the using block.
    using (FileStream file = new FileStream("Data/simple.htm", FileMode.Open))
    {
        var doc = new HtmlDoc(file).RootElement;

        var tables = from table in doc.FindElements("table")
                     where table.Attributes["width"] == "98%"
                     where table.Attributes["cellpadding"] == "7"
                     select table;
        Assert.AreEqual(1, tables.Count());

        tables = from table in doc.FindElements()
                 where table.Name == "table"
                 select table;
        Assert.AreEqual(3, tables.Count());
    }
}
/// <summary>
/// Maps a persisted layout string back to the dock content it describes.
/// </summary>
/// <param name="persistString">
/// Either the type name of one of the tool windows, or a three-part comma-separated string
/// whose first part is the <c>HtmlDoc</c> type name, followed by a file name and a caption.
/// </param>
/// <returns>
/// The matching window instance, or <c>null</c> when the string is not recognized.
/// </returns>
private IDockContent GetContentFromPersistString(string persistString)
{
    if (persistString == typeof(HelpAndExplainWindow).ToString())
    {
        return m_helpExplorer;
    }

    if (persistString == typeof(PropertyWindow).ToString())
    {
        return m_propertyWindow;
    }

    if (persistString == typeof(ToolWindow).ToString())
    {
        return m_toolbox;
    }

    string[] parts = persistString.Split(new char[] { ',' });
    if (parts.Length != 3 || parts[0] != typeof(HtmlDoc).ToString())
    {
        return null;
    }

    // Assure that we deal with just one instance only.
    m_docWindow = EditorDocument;

    string fileName = parts[1];
    if (fileName != string.Empty)
    {
        m_docWindow.FileName = fileName;
    }

    string caption = parts[2];
    if (caption != string.Empty)
    {
        m_docWindow.Text = caption;
    }

    // Show the common help when the document has no file name or still carries the default caption.
    if (String.IsNullOrEmpty(m_docWindow.FileName) || m_docWindow.Text.Equals("New Document"))
    {
        ShowCommonHelp();
    }

    return m_docWindow;
}
/// <summary>
/// Builds <c>emotion_list.xml</c>: for every <c>img</c> element found in
/// <c>Data/faceRank.htm</c> whose <c>src</c> matches the path of an entry in
/// <c>EmotionParser.EmotionList</c> (initialized from <c>Data/face.htm</c>), an <c>img</c>
/// element carrying that entry's <c>Id</c> and <c>Path</c> is written under <c>Root</c>.
/// Sets <c>WelcomeTitle</c> to "Done" when finished.
/// </summary>
private void GenerateXml()
{
    XDocument doc = new XDocument();
    XElement root = new XElement("Root");
    doc.Add(root);

    // Both input streams are disposed once the document tree has been built.
    using (FileStream sourcePage = new FileStream("Data/face.htm", FileMode.Open))
    using (FileStream rankPage = new FileStream("Data/faceRank.htm", FileMode.Open))
    {
        EmotionParser.Init(sourcePage);

        var ranks = new HtmlDoc(rankPage).RootElement.Descendants("img");
        foreach (var image in ranks)
        {
            XElement e = null;
            foreach (var item in EmotionParser.EmotionList)
            {
                if (image.Attributes["src"] == item.Value.Path)
                {
                    e = new XElement("img");
                    XAttribute idAttribute = new XAttribute("Id", item.Value.Id);
                    XAttribute pathAttribute = new XAttribute("Path", item.Value.Path);
                    e.Add(idAttribute);
                    e.Add(pathAttribute);
                    System.Diagnostics.Debug.WriteLine(item.Value.Path);

                    // Only the first matching emotion is used for an image.
                    break;
                }
            }

            if (e != null)
            {
                root.Add(e);
            }
        }
    }

    var settings = new XmlWriterSettings { Indent = true, NewLineOnAttributes = false };

    // Disposing the writer flushes it and closes the output file; Flush() alone left the file open.
    using (var writer = XmlWriter.Create("emotion_list.xml", settings))
    {
        doc.WriteTo(writer);
    }

    WelcomeTitle = "Done";
}
/// <summary>
/// Normalizes the post message, fills in its attachments, parses the HTML-decoded message
/// wrapped in a <c>div</c>, and appends the regrouped content to <paramref name="item"/>.
/// </summary>
/// <param name="post">Post whose <c>Message</c> is normalized in place and then parsed.</param>
/// <param name="item">Target that receives the regrouped content.</param>
private void BuildContent(PostItem post, S1PostItem item)
{
    post.Message = post.Message ?? "";

    // Work around: repair the malformed "<imgwidth=" tag and strip line feeds.
    post.Message = post.Message.Replace("<imgwidth=", "<img width=").Replace("\n", "");
    FillAttachment(post);

    var decoded = S1Resource.HttpUtility.HtmlDecode(post.Message);
    var document = new HtmlDoc(string.Format("<div>{0}</div>", decoded));
    var rootElement = document.RootElement;
    if (rootElement == null)
    {
        return;
    }

    var regrouped = SimpleParser.SimpleThreadParser.ReGroupContent(rootElement);
    item.AddRange(regrouped);
}
/// <summary>
/// Removes every node matched by the given selector from the document.
/// </summary>
/// <param name="tag">Tag name (selector) of the nodes to remove.</param>
/// <returns>This helper, to allow call chaining.</returns>
/// <exception cref="Exception">Thrown when a document is loaded and <paramref name="tag"/> is empty.</exception>
public HtmlDocHelper RemoveNode(string tag)
{
    if (HtmlDoc != null)
    {
        if (tag.IsEmpty())
        {
            throw new Exception("请指定参数.");
        }

        foreach (var match in HtmlDoc.QuerySelectorAll(tag))
        {
            match.Remove();
        }
    }

    return this;
}
/// <summary>
/// Adds an attribute to every node matched by the given selector.
/// </summary>
/// <param name="tag">Tag name (selector) of the nodes to modify.</param>
/// <param name="attr">Name of the attribute to add.</param>
/// <param name="attrValue">Value of the attribute to add.</param>
/// <returns>This helper, to allow call chaining.</returns>
/// <exception cref="Exception">
/// Thrown when a document is loaded and <paramref name="tag"/> or <paramref name="attr"/> is empty.
/// </exception>
public HtmlDocHelper AddAttr(string tag, string attr, string attrValue)
{
    if (HtmlDoc != null)
    {
        if (tag.IsEmpty() || attr.IsEmpty())
        {
            throw new Exception("请指定参数.");
        }

        foreach (var match in HtmlDoc.QuerySelectorAll(tag))
        {
            match.Attributes.Add(attr, attrValue);
        }
    }

    return this;
}
/// <summary>
/// Downloads the resource described by <paramref name="wire"/>, decodes the bytes as UTF-8 and
/// loads the text into a newly created document.
/// </summary>
/// <param name="factory">Factory passed to <c>CreateRequest</c>.</param>
/// <param name="wire">Request description passed to <c>CreateRequest</c>.</param>
/// <param name="length">Receives the number of downloaded bytes (0 when the download did not yield a byte array).</param>
/// <returns>The document loaded from the downloaded text.</returns>
private static HtmlDoc GetDocument(IHttpRequestFactory factory, IHttpWire wire, out int length)
{
    var request = CreateRequest(factory, wire);

    // Anything other than a byte array is treated as an empty download.
    byte[] payload = (request.Download() as byte[]) ?? new byte[0];
    length = payload.Length;

    string markup = Encoding.UTF8.GetString(payload, 0, payload.Length);

    HtmlDoc document = Config.DomFactory.Create();
    document.Load(markup);
    return document;
}
/// <summary>
/// Posts to the privacy page and returns the <c>value</c> attribute of the <c>input</c> element
/// named <c>verify</c> found in the response.
/// </summary>
/// <param name="client">Client used to post the request.</param>
/// <returns>The verify string taken from the page.</returns>
/// <exception cref="S1UserException">
/// Thrown with the parsed page error when the response has no <c>verify</c> input.
/// </exception>
public static async Task<string> GetVerifyString(this S1WebClient client)
{
    // DownloadString would just return cached data, which is not wanted here;
    // posting dummy data disables the cache.
    var privacyPage = await client.PostDataTaskAsync(new Uri(UserAction.PrivacyUrl));
    var root = new HtmlDoc(privacyPage).RootElement;

    var verifyInput = root.FindFirst("input", candidate => candidate.Attributes["name"] == "verify");
    if (verifyInput == null)
    {
        throw new S1UserException(ErrorParser.Parse(root));
    }

    return verifyInput.Attributes["value"];
}
/// <summary>
/// Learns a program to extract the surname from a given table row (rather than a whole document)
/// using a negative example.
/// </summary>
/// <remarks>
/// The positive example maps the 1st row to its 2nd cell; the negative example states that the
/// 2nd row must not map to its 1st cell. The learned program is run on the 1st row. Nothing is
/// printed when no program could be learned or when the program yields no region.
/// </remarks>
public static void LearnSurnameWithRespectToTableRowUsingNegativeExample()
{
    string s = File.ReadAllText(Path.Combine(_sampleDocs, "sample-document-1.html"));
    HtmlDoc doc = HtmlDoc.Create(s);

    WebRegion referenceRegion1 = doc.GetRegion("tr:nth-child(1)"); // 1st table row
    WebRegion referenceRegion2 = doc.GetRegion("tr:nth-child(2)"); // 2nd table row

    var posExampleSpec = new CorrespondingMemberEquals<WebRegion, WebRegion>(
        referenceRegion1, doc.GetRegion("tr:nth-child(1) td:nth-child(2)"));
    var negExampleSpec = new CorrespondingMemberDoesNotEqual<WebRegion, WebRegion>(
        referenceRegion2, doc.GetRegion("tr:nth-child(2) td:nth-child(1)"));

    // The array is typed explicitly because the two specs have different concrete types.
    Web.RegionProgram prog = Web.RegionLearner.Instance.Learn(
        new Constraint<IEnumerable<WebRegion>, IEnumerable<WebRegion>>[] { posExampleSpec, negExampleSpec });
    if (prog == null)
    {
        return;
    }

    WebRegion region = prog.Run(new[] { referenceRegion1 })?.SingleOrDefault();
    if (region == null)
    {
        // Both "?." and SingleOrDefault can produce null; dereferencing it below would throw.
        return;
    }

    Console.WriteLine("Learn surname with respect to table row using negative example: ");
    Console.WriteLine(region.GetSpecificSelector());
    Console.WriteLine(region.Text());
    Console.WriteLine();
}
/// <summary>
/// Learns a program to extract the first surname in the document from one example.
/// </summary>
/// <remarks>
/// The example maps the whole document to the 2nd cell of its 1st table row; the learned program
/// is run on that same document. Nothing is printed when no program could be learned or when the
/// program yields no region.
/// </remarks>
public static void LearnFirstSurnameInDocumentUsingOneExample()
{
    string s = File.ReadAllText(Path.Combine(_sampleDocs, "sample-document-1.html"));
    HtmlDoc doc = HtmlDoc.Create(s);

    WebRegion referenceRegion = new WebRegion(doc);
    WebRegion exampleRegion = doc.GetRegion("tr:nth-child(1) td:nth-child(2)"); // 2nd cell in 1st table row
    CorrespondingMemberEquals<WebRegion, WebRegion> exampleSpec =
        new CorrespondingMemberEquals<WebRegion, WebRegion>(referenceRegion, exampleRegion);

    Web.RegionProgram prog = Web.RegionLearner.Instance.Learn(new[] { exampleSpec });
    if (prog == null)
    {
        return;
    }

    // Run the program to extract the first surname from the document.
    WebRegion region = prog.Run(new[] { referenceRegion })?.SingleOrDefault();
    if (region == null)
    {
        // Both "?." and SingleOrDefault can produce null; dereferencing it below would throw.
        return;
    }

    Console.WriteLine("Learn first surname in document from one example: ");
    Console.WriteLine(region.GetSpecificSelector());
    Console.WriteLine(region.Text());
    Console.WriteLine();
}
/// <summary>
/// Probes the server at <c>Addr + path</c> and reports progress through <c>Status</c>:
/// "Connecting" while downloading, "Wrong Data" for an empty response, "Server Down" when the
/// page title contains the configured server-down title, "Success" otherwise, "Cancled" on
/// cancellation and "Failed" on any other exception. <c>NotifySuccess</c> is raised for every
/// non-empty response.
/// </summary>
public async void TestServer()
{
    Status = "Connecting";
    try
    {
        client = new S1WebClient();
        var response = await client.DownloadStringTaskAsync(Addr + path);

        // Stays in effect when the response turns out to be empty.
        Status = "Wrong Data";
        if (!(response.Length > 0))
        {
            return;
        }

        var root = new HtmlDoc(response).RootElement;
        var downTitle = ServerListViewModel.ServerDownTitle;
        bool reportsDown = downTitle != null
            && root.FindFirst("title").InnerHtml.Contains(downTitle);

        if (reportsDown)
        {
            Status = "Server Down";
            if (NotifySuccess != null)
            {
                NotifySuccess();
            }
        }
        else
        {
            if (NotifySuccess != null)
            {
                NotifySuccess();
            }

            Status = "Success";
        }
    }
    catch (TaskCanceledException)
    {
        Status = "Cancled";
    }
    catch (Exception)
    {
        Status = "Failed";
    }
}
/// <summary>
/// Removes the given attributes from every node matched by the given selector.
/// </summary>
/// <param name="tag">Tag name (selector) of the nodes to modify.</param>
/// <param name="attrs">Names of the attributes to remove; at least one is required.</param>
/// <returns>This helper, to allow call chaining.</returns>
/// <exception cref="Exception">
/// Thrown when a document is loaded and <paramref name="tag"/> is empty or
/// <paramref name="attrs"/> is null or empty.
/// </exception>
public HtmlDocHelper RemoveAttr(string tag, params string[] attrs)
{
    if (HtmlDoc == null)
    {
        return this;
    }

    // A caller can pass an explicit null for the params array; report it as a missing
    // argument like the other cases instead of failing on attrs.Length.
    if (tag.IsEmpty() || attrs == null || attrs.Length == 0)
    {
        throw new Exception("请指定参数.");
    }

    var nodes = HtmlDoc.QuerySelectorAll(tag);
    foreach (var item in nodes)
    {
        foreach (var attr in attrs)
        {
            item.Attributes.Remove(attr);
        }
    }

    return this;
}
/// <summary>
/// Downloads the page for every entry in <c>handles</c> (URL built with
/// <c>string.Format(URL, handle)</c>), wraps each response body in a <c>UserStatus</c>, and
/// writes all of them into a new HTML document. Pauses 500 ms after each request.
/// </summary>
static void ProcessHandles()
{
    var users = new List<UserStatus>();
    foreach (string handle in handles)
    {
        var request = WebRequest.Create(string.Format(URL, handle));

        string line;

        // Dispose the response as well as the reader: an undisposed WebResponse keeps its
        // connection open, and one is created per handle.
        using (var response = request.GetResponse())
        using (var reader = new StreamReader(response.GetResponseStream()))
        {
            line = reader.ReadToEnd();
        }

        users.Add(new UserStatus(handle, line, time));

        // Pause between consecutive requests.
        Thread.Sleep(500);
    }

    var htmlDoc = new HtmlDoc();
    htmlDoc.AddUsersToTable(users);
    htmlDoc.WriteHtmlDoc();
}