/// <summary>
/// Converts a UfDataNode structure into a very basic form HTML.
/// The output is a single <c>div</c> with class <c>microformats</c> that wraps
/// whatever <c>AddNode</c> writes for each direct child of <paramref name="node"/>.
/// </summary>
/// <param name="node">Node whose child nodes are rendered</param>
/// <param name="formatDescriber">Microformat format describer object; its BaseElement is passed to AddNode for every child</param>
/// <returns>HTML string</returns>
/// <exception cref="ArgumentNullException">node or formatDescriber is null</exception>
public string Convert(UfDataNode node, UfFormatDescriber formatDescriber)
{
    if (node == null)
        throw new ArgumentNullException("node");
    if (formatDescriber == null)
        throw new ArgumentNullException("formatDescriber");

    UfElementDescriber elementDescriber = formatDescriber.BaseElement;

    using (StringWriter stringWriter = new StringWriter())
    {
        using (XhtmlTextWriter writer = new XhtmlTextWriter(stringWriter))
        {
            // Opening tag is written in pieces: <div class="microformats">
            writer.WriteBeginTag("div");
            writer.WriteAttribute("class", "microformats");
            writer.Write(HtmlTextWriter.TagRightChar);

            foreach (UfDataNode child in node.Nodes)
            {
                writer.WriteLine();
                AddNode(child, elementDescriber, writer);
            }

            writer.WriteEndTag("div");
            writer.WriteLine();
        }

        // Read the buffer only after the html writer has been disposed so that
        // everything it wrote is present in the result.
        return stringWriter.ToString();
    }
}
/// <summary>
/// Parses microformats out of raw Html markup.
/// The markup is cleaned of two known-problematic fragments, loaded into an
/// HtmlAgilityPack document and handed to the HtmlDocument overload of Load.
/// </summary>
/// <param name="htmlString">Html string</param>
/// <param name="url">A Url for relative path operations</param>
/// <param name="formatDescriber">The microformat format describer</param>
public void Load(string htmlString, string url, UfFormatDescriber formatDescriber)
{
    // Temp fix xhtml strict issue: remove the XHTML 1.0 Strict doctype and a
    // content-type meta tag before the markup reaches the parser.
    // NOTE(review): the meta literal contains "charset => utf-8", which looks unusual
    // for real markup - confirm this is the exact text that needs stripping.
    string withoutDoctype = htmlString.Replace("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">", "");
    string cleanedHtml = withoutDoctype.Replace("<meta content=\"text/html; charset => utf-8\" http-equiv=\"Content-Type\" />", "");

    HtmlDocument parsedDocument = new HtmlDocument();
    parsedDocument.LoadHtml(cleanedHtml);

    Load(parsedDocument, url, formatDescriber);
}
/// <summary>
/// Load and parse a Html document.
/// Records the document, url and format describer on this instance, reads the
/// base url and page title, optionally narrows the start node to the element
/// addressed by the url fragment, then runs the microformat parse.
/// </summary>
/// <param name="document">HtmlAgilityPack Htmldocument object</param>
/// <param name="url">The source Url of the document; null or empty means no url (no fragment selection is attempted)</param>
/// <param name="formatDescriber">The microformat format describer</param>
/// <exception cref="ArgumentNullException">document is null</exception>
/// <exception cref="Exception">The url has a fragment that matches neither a named anchor nor an element id</exception>
public void Load(HtmlDocument document, string url, UfFormatDescriber formatDescriber)
{
    if (document == null)
        throw new ArgumentNullException("document");

    this.url = url;
    this.formatDescriber = formatDescriber;
    this.document = document;

    // Add in the whole html string from the page into the top data node
    data.OuterHtml = this.document.DocumentNode.OuterHtml;

    // NOTE(review): each assignment overwrites the previous one, so only the result of
    // the //base lookup survives. If FindDocumentNodeAttributeValue returns an empty
    // value when nothing is found, an xml:base on html/body is lost - confirm intent.
    this.baseUrl = FindDocumentNodeAttributeValue("//html", "xml:base");
    this.baseUrl = FindDocumentNodeAttributeValue("//body", "xml:base");
    this.baseUrl = FindDocumentNodeAttributeValue("//base", "href");

    // Find the html page title; when several title elements exist the last one wins
    HtmlNodeCollection titleNodes = this.document.DocumentNode.SelectNodes("//title");
    if (titleNodes != null)
    {
        foreach (HtmlNode titleNode in titleNodes)
            this.htmlPageTitle = titleNode.InnerText;
    }

    // Start with document node
    this.startNode = document.DocumentNode;

    // Find any fragment select, e.g. <a name="profile"> html nodes </a>.
    // IsNullOrEmpty (rather than != "") keeps a null url from reaching new Uri(null).
    if (!string.IsNullOrEmpty(url))
    {
        Uri uri = new Uri(url);
        string frag = uri.Fragment;
        if (frag != string.Empty)
            this.startNode = FindFragmentNode(frag);
    }

    // Starts recursion
    ParseUfElement(this.startNode, this.FormatDescriber.BaseElement, this.Data, true);
    UfHelpers.RunNodeOptimization(this.Data);
}

/// <summary>
/// Resolves a url fragment to a node: first an anchor with a matching name attribute,
/// then any element with a matching id attribute.
/// </summary>
/// <param name="frag">Url fragment as returned by Uri.Fragment (including the leading #)</param>
/// <returns>The first matching node</returns>
/// <exception cref="Exception">No node matches, or the lookup itself failed (original error kept as InnerException)</exception>
private HtmlNode FindFragmentNode(string frag)
{
    string fragmentName = frag.Replace("#", "");
    HtmlNodeCollection matches;

    try
    {
        // A name based fragment selection
        matches = this.document.DocumentNode.SelectNodes("//a[@name='" + fragmentName + "']");
        if (matches == null || matches.Count == 0)
        {
            // ID based fragment selection
            matches = this.document.DocumentNode.SelectNodes("//*[@id='" + fragmentName + "']");
        }
    }
    catch (Exception ex)
    {
        // The fragment text is spliced into the XPath expression, so an unusual fragment
        // (for example one containing a quote) can make the query itself fail.
        throw new Exception("Could not find name fragment" + frag, ex);
    }

    if (matches == null || matches.Count == 0)
        throw new Exception("Could not find name fragment" + frag);

    return matches[0];
}
//-----------------------------------------------------------------------
/// <summary>
/// Load and parse a Html string when no source Url is available.
/// Forwards to the three-argument overload with an empty Url.
/// </summary>
/// <param name="htmlString">Html string</param>
/// <param name="formatDescriber">The microformat format describer</param>
public void Load(string htmlString, UfFormatDescriber formatDescriber)
{
    const string noUrl = "";
    this.Load(htmlString, noUrl, formatDescriber);
}
/// <summary>
/// Load and parse a Html document when no source Url is available.
/// Forwards to the three-argument overload with an empty Url.
/// </summary>
/// <param name="document">HtmlAgilityPack Htmldocument object</param>
/// <param name="formatDescriber">The microformat format describer</param>
public void Load(HtmlDocument document, UfFormatDescriber formatDescriber)
{
    const string noUrl = "";
    this.Load(document, noUrl, formatDescriber);
}
/// <summary>
/// Page entry point. Reads the "format" and "url" query string values, maps the
/// format name onto a format describer, loads and parses the url, and writes the
/// result to the response as JSON when at least one node was found.
/// Nothing is written when the format is unknown, the url is missing, or no
/// nodes were parsed.
/// </summary>
/// <param name="sender">Event sender</param>
/// <param name="e">Event arguments</param>
protected void Page_Load(object sender, EventArgs e)
{
    // A missing query string value is treated the same as an empty one
    string formatString = Request.QueryString["format"] ?? "";
    string url = Request.QueryString["url"] ?? "";

    UfFormatDescriber formatDescriber;
    switch (formatString)
    {
        case "hcard":
            formatDescriber = UfFormats.HCard();
            break;
        case "hcalendar":
            formatDescriber = UfFormats.HCalendar();
            break;
        case "hreview":
            formatDescriber = UfFormats.HReview();
            break;
        case "hresume":
            formatDescriber = UfFormats.HResume();
            break;
        case "hatom":
            formatDescriber = UfFormats.HAtomItem();
            break;
        case "xfn":
            formatDescriber = UfFormats.Xfn();
            break;
        case "tag":
            formatDescriber = UfFormats.Tag();
            break;
        case "geo":
            formatDescriber = UfFormats.Geo();
            break;
        case "adr":
            formatDescriber = UfFormats.Adr();
            break;
        case "no-follow":
            formatDescriber = UfFormats.NoFollow();
            break;
        case "license":
            formatDescriber = UfFormats.License();
            break;
        case "votelinks":
            formatDescriber = UfFormats.VoteLinks();
            break;
        case "hcard-xfn":
            formatDescriber = UfFormats.HCardXFN();
            break;
        case "me":
            formatDescriber = UfFormats.Me();
            break;
        case "nextprevious":
            formatDescriber = UfFormats.NextPrevious();
            break;
        case "test-suite":
            formatDescriber = UfFormats.TestSuite();
            break;
        case "test-fixture":
            formatDescriber = UfFormats.TestFixture();
            break;
        default:
            // Unknown or missing format name
            formatDescriber = null;
            break;
    }

    if (formatDescriber == null || url == "")
    {
        return;
    }

    UfWebRequest webRequest = new UfWebRequest();
    webRequest.Load(url, formatDescriber);

    if (webRequest.Data.Nodes.Count <= 0)
    {
        return;
    }

    UfDataToJson jsonConverter = new UfDataToJson();
    Response.ContentType = "application/json";
    Response.Write(jsonConverter.Convert(webRequest.Data, formatDescriber));
}
/// <summary>
/// Loads a single Html page and does a microformat parse.
/// The url is trimmed and url-decoded before the page is fetched. A report entry
/// for the page is added to the parsed-url list as soon as the page is fetched
/// and to Urls once it has been parsed successfully.
/// </summary>
/// <param name="url">The Url of the webpage to be parsed</param>
/// <param name="formatDescriber">A format describer for microformat to be parsed</param>
/// <exception cref="Exception">
/// No url was given (null or empty), or the page answered with a status code other than 200.
/// Exceptions raised while loading or parsing propagate unchanged unless their message is
/// empty, in which case they are wrapped with a "Could not load Url" message.
/// </exception>
public void Load(string url, UfFormatDescriber formatDescriber)
{
    _formatDescriber = formatDescriber;

    try
    {
        // IsNullOrEmpty so that a null url reports "No Url given" instead of
        // failing on url.Trim() with a NullReferenceException.
        if (string.IsNullOrEmpty(url))
            throw new Exception("No Url given");

        // Check for issues with url
        url = url.Trim();
        url = HttpUtility.UrlDecode(url);

        UfWebPage webPage = LoadHtmlDoc(url);
        if (webPage != null)
        {
            Url urlReport = new Url { Address = webPage.Url, Status = webPage.StatusCode };
            _parsedUrls.Add(urlReport);

            // UtcNow: elapsed time must not be affected by local clock (DST) shifts.
            // The measurement starts after the page was fetched, so it covers the parse only.
            DateTime started = DateTime.UtcNow;

            if (webPage.StatusCode == 200 && webPage.Html != null)
                ParseUf(webPage.Html, url, formatDescriber, false, urlReport);

            if (webPage.StatusCode != 200)
                throw new Exception("Could not load url: " + url + " " + webPage.StatusCode);

            urlReport.LoadTime = DateTime.UtcNow.Subtract(started);
            Urls.Add(urlReport);
        }
    }
    catch (Exception ex)
    {
        if (ex.Message == string.Empty)
        {
            // Give message-less failures a usable message, keeping the original as InnerException
            throw new Exception("Could not load Url: " + url, ex);
        }

        throw;
    }
}
/// <summary>
/// Runs a microformat parse over one html document and stores the outcome.
/// </summary>
/// <param name="htmlDoc">Document to parse</param>
/// <param name="url">Url the document was loaded from</param>
/// <param name="format">Format describer to parse with</param>
/// <param name="multiples">True to append the parsed data to the nodes of the current data; false to replace the current data</param>
/// <param name="urlReport">Report entry that receives the page title found by the parser</param>
private void ParseUf(HtmlDocument htmlDoc, string url, UfFormatDescriber format, bool multiples, Url urlReport)
{
    UfParse parser = new UfParse();
    parser.Load(htmlDoc, url, format);

    if (!multiples)
    {
        _data = parser.Data;
    }
    else
    {
        _data.Nodes.Add(parser.Data);
    }

    urlReport.HtmlPageTitle = parser.HtmlPageTitle;
}
/// <summary>
/// Converts a UfDataNode structure into JSON, optionally wrapped in a JSONP callback.
/// The callback name is stored on this instance with all parentheses removed and
/// surrounding whitespace trimmed, then the two-argument overload builds the output.
/// </summary>
/// <param name="node">Node</param>
/// <param name="formatDescriber">Microformat format describer object</param>
/// <param name="callBack">JSONP callback function name to wrap JSON object; null is treated as empty</param>
/// <returns>JSON string</returns>
public string Convert(UfDataNode node, UfFormatDescriber formatDescriber, string callBack)
{
    // A null callback previously failed on .Replace with a NullReferenceException;
    // treat it as an empty callback name instead.
    // NOTE(review): only "(" and ")" are stripped; if the name can come from a request,
    // consider restricting it to identifier characters.
    this.callBack = (callBack ?? string.Empty).Replace("(", "").Replace(")", "").Trim();

    return Convert(node, formatDescriber);
}
/// <summary>
/// Converts a UfDataNode structure into JSON.
/// Children of <paramref name="node"/> whose name equals the format's base element name
/// are appended to the internal tree, which is then serialised inside a
/// <c>{"microformats": { ... }}</c> object together with the error and reporting
/// sections. When a callback name has been set the whole object is wrapped as
/// <c>name( ... )</c>.
/// </summary>
/// <param name="node">Node</param>
/// <param name="formatDescriber">Microformat format describer object</param>
/// <returns>JSON string</returns>
public string Convert(UfDataNode node, UfFormatDescriber formatDescriber)
{
    // NOTE(review): nodes are appended to the tree field and nothing in this method
    // clears it, so repeated calls on one instance look like they accumulate - confirm.
    foreach (UfDataNode childNode in node.Nodes)
    {
        if (childNode.Name == formatDescriber.BaseElement.Name)
        {
            UfDataNode xChild = tree.Nodes.Append(childNode.Name, childNode.Value, childNode.SourceUrl, childNode.RepresentativeNode);
            if (childNode.Nodes.Count > 0)
                AddChildNodes(xChild, childNode, formatDescriber.BaseElement);
        }
    }

    // IsNullOrEmpty: a callback that was never set (null) must not produce a bare "( ... )" wrapper
    bool hasCallBack = !string.IsNullOrEmpty(callBack);

    System.Text.StringBuilder output = new System.Text.StringBuilder();

    if (hasCallBack)
        output.Append(callBack).Append("( ");

    output.Append("{\"microformats\": {");

    foreach (UfDataNode childNode in tree.Nodes)
        output.Append(BuildDataString(childNode, true, false));

    // Drop the last two characters written for the final node
    // (presumably a trailing separator emitted by BuildDataString - confirm).
    if (tree.Nodes.Count > 0)
        output.Length -= 2;

    output.Append(AddUfErrors());
    output.Append(AddReporting(node));

    // End whole block
    output.Append("}}");

    if (hasCallBack)
        output.Append(" )");

    return output.ToString();
}