/// <summary>
        /// Converts a UfDataNode structure into a very basic form of HTML.
        /// </summary>
        /// <param name="node">Node</param>
        /// <param name="formatDescriber">Microformat format describer object</param>
        /// <returns>HTML string</returns>
        public string Convert(UfDataNode node, UfFormatDescriber formatDescriber)
        {
            // The base element describer is handed to AddNode for every child node.
            UfElementDescriber elementDescriber = formatDescriber.BaseElement;

            using (StringWriter stringWriter = new StringWriter())
            {
                using (XhtmlTextWriter writer = new XhtmlTextWriter(stringWriter))
                {
                    // Opening wrapper tag: <div class="microformats">
                    writer.WriteBeginTag("div");
                    writer.WriteAttribute("class", "microformats");
                    writer.Write(HtmlTextWriter.TagRightChar);

                    // Each direct child of the supplied node is written on its own line.
                    foreach (UfDataNode child in node.Nodes)
                    {
                        writer.WriteLine();
                        AddNode(child, elementDescriber, writer);
                    }

                    writer.WriteEndTag("div");
                    writer.WriteLine();
                }

                // Read the buffer only after the markup writer has been disposed,
                // exactly as the method did before; StringWriter.ToString() still
                // returns the accumulated text at that point.
                return stringWriter.ToString();
            }
        }
Exemple #2
0
        /// <summary>
        /// Load and parse a Html string.
        /// </summary>
        /// <param name="htmlString">Html string</param>
        /// <param name="url">A Url for relative path operations</param>
        /// <param name="formatDescriber">The microformat format describer</param>
        public void Load(string htmlString, string url, UfFormatDescriber formatDescriber)
        {
            // Fail with a descriptive argument error instead of a NullReferenceException
            // from the Replace calls below (same guard style as the HtmlDocument overload).
            if (htmlString == null)
                throw new ArgumentNullException("htmlString");

            // Temp fix xhtml strict issue: strip the exact XHTML 1.0 Strict doctype and the
            // content-type meta element before the markup is handed to the parser.
            // NOTE(review): the meta literal contains "charset => utf-8"; confirm that this
            // is really the text that should be removed.
            htmlString = htmlString.Replace("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">", "");
            htmlString = htmlString.Replace("<meta content=\"text/html; charset => utf-8\" http-equiv=\"Content-Type\" />", "");

            // Build the document and delegate to the HtmlDocument overload.
            HtmlDocument document = new HtmlDocument();
            document.LoadHtml(htmlString);
            this.Load(document, url, formatDescriber);
        }
Exemple #3
0
        /// <summary>
        /// Load and parse a Html document.
        /// </summary>
        /// <param name="document">HtmlAgilityPack Htmldocument object</param>
        /// <param name="url">The source Url of the document</param>
        /// <param name="formatDescriber">The microformat format describer</param>
        public void Load(HtmlDocument document, string url, UfFormatDescriber formatDescriber)
        {
            if (document == null)
                throw new ArgumentNullException("document");

            this.url = url;
            this.formatDescriber = formatDescriber;
            this.document = document;

            // Add in the whole html string from the page into the top data node
            data.OuterHtml = this.document.DocumentNode.OuterHtml;

            // NOTE(review): every call assigns baseUrl, so the value from the last lookup
            // (//base href) is the one that remains unless the helper itself preserves an
            // earlier value - confirm against FindDocumentNodeAttributeValue.
            this.baseUrl = FindDocumentNodeAttributeValue("//html", "xml:base");
            this.baseUrl = FindDocumentNodeAttributeValue("//body", "xml:base");
            this.baseUrl = FindDocumentNodeAttributeValue("//base", "href");

            // Find the html page title; when several <title> elements exist the last one wins.
            HtmlNodeCollection titleNodes = this.document.DocumentNode.SelectNodes("//title");
            if (titleNodes != null)
            {
                foreach (HtmlNode titleNode in titleNodes)
                    this.htmlPageTitle = titleNode.InnerText;
            }

            // Start with document node
            this.startNode = document.DocumentNode;

            // Find any fragment select, e.g. <a name="profile"> html nodes </a>.
            // A null url is treated like an empty one instead of failing inside new Uri(null).
            if (!string.IsNullOrEmpty(url))
            {
                Uri uri = new Uri(url);
                string frag = uri.Fragment;
                if (frag != string.Empty)
                {
                    string fragmentName = frag.Replace("#", "");
                    HtmlNodeCollection matches;

                    try
                    {
                        // A name based fragment selection first...
                        matches = this.document.DocumentNode.SelectNodes("//a[@name='" + fragmentName + "']");

                        // ...falling back to an ID based fragment selection.
                        if (matches == null)
                            matches = this.document.DocumentNode.SelectNodes("//*[@id='" + fragmentName + "']");
                    }
                    catch (Exception ex)
                    {
                        // The fragment text is concatenated into the XPath expression, so an
                        // unusual fragment (for example one containing a quote) can make the
                        // query itself fail. Keep the original error as the inner exception.
                        throw new Exception("Could not find name fragment" + frag, ex);
                    }

                    // Report a missing fragment explicitly rather than relying on a
                    // NullReferenceException from indexing a null collection.
                    if (matches == null || matches.Count == 0)
                        throw new Exception("Could not find name fragment" + frag);

                    this.startNode = matches[0];
                }
            }

            // Starts recursion
            ParseUfElement(this.startNode, this.FormatDescriber.BaseElement, this.Data, true);

            UfHelpers.RunNodeOptimization(this.Data);
        }
Exemple #4
0
 //-----------------------------------------------------------------------
 /// <summary>
 /// Load and parse a Html string, passing an empty Url for relative path operations.
 /// </summary>
 /// <param name="htmlString">Html string</param>
 /// <param name="formatDescriber">The microformat format describer</param>
 public void Load(string htmlString, UfFormatDescriber formatDescriber)
 {
     // No source Url is known for a raw string, so forward an empty one
     // to the three-argument overload.
     this.Load(htmlString, string.Empty, formatDescriber);
 }
Exemple #5
0
 /// <summary>
 /// Load and parse a Html document.
 /// </summary>
 /// <param name="document">HtmlAgilityPack Htmldocument object</param>
 /// <param name="formatDescriber">The microformat format describer</param>
 public void Load(HtmlDocument document, UfFormatDescriber formatDescriber)
 {
     // Forward to the three-argument overload with an empty source Url.
     this.Load(document, string.Empty, formatDescriber);
 }
        /// <summary>
        /// Reads the "format" and "url" query string values, parses the page at that
        /// Url with the matching format describer and writes the result as JSON.
        /// Nothing is written when the format is unknown, the Url is empty or the
        /// parse produced no nodes.
        /// </summary>
        /// <param name="sender">Event source</param>
        /// <param name="e">Event arguments</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            // Missing query string values fall back to an empty string.
            string requestedFormat = Request.QueryString["format"] ?? "";
            string targetUrl = Request.QueryString["url"] ?? "";

            UfFormatDescriber formatDescriber = ResolveFormatDescriber(requestedFormat);
            if (formatDescriber == null || targetUrl == "")
            {
                return;
            }

            UfWebRequest webRequest = new UfWebRequest();
            webRequest.Load(targetUrl, formatDescriber);

            if (webRequest.Data.Nodes.Count <= 0)
            {
                return;
            }

            UfDataToJson dataConvertor = new UfDataToJson();
            Response.ContentType = "application/json";
            Response.Write(dataConvertor.Convert(webRequest.Data, formatDescriber));
        }

        // Maps a "format" query string value to its format describer; null when the name is not recognised.
        private UfFormatDescriber ResolveFormatDescriber(string formatName)
        {
            switch (formatName)
            {
                case "hcard":        return UfFormats.HCard();
                case "hcalendar":    return UfFormats.HCalendar();
                case "hreview":      return UfFormats.HReview();
                case "hresume":      return UfFormats.HResume();
                case "hatom":        return UfFormats.HAtomItem();
                case "xfn":          return UfFormats.Xfn();
                case "tag":          return UfFormats.Tag();
                case "geo":          return UfFormats.Geo();
                case "adr":          return UfFormats.Adr();
                case "no-follow":    return UfFormats.NoFollow();
                case "license":      return UfFormats.License();
                case "votelinks":    return UfFormats.VoteLinks();
                case "hcard-xfn":    return UfFormats.HCardXFN();
                case "me":           return UfFormats.Me();
                case "nextprevious": return UfFormats.NextPrevious();
                case "test-suite":   return UfFormats.TestSuite();
                case "test-fixture": return UfFormats.TestFixture();
                default:             return null;
            }
        }
        /// <summary>
        /// Loads a single Html page and does a microformat parse
        /// </summary>
        /// <param name="url">The Url of the webpage to be parsed</param>
        /// <param name="formatDescriber">A format describer for microformat to be parsed</param>
        public void Load(string url, UfFormatDescriber formatDescriber)
        {
            _formatDescriber = formatDescriber;

            // A null Url is reported the same way as an empty one instead of
            // surfacing as a NullReferenceException from url.Trim().
            if (string.IsNullOrEmpty(url))
            {
                throw new Exception("No Url given");
            }

            try
            {
                // Check for issues with url
                url = url.Trim();
                url = HttpUtility.UrlDecode(url);

                UfWebPage webPage = LoadHtmlDoc(url);
                if (webPage == null)
                {
                    return;
                }

                Url urlReport = new Url
                {
                    Address = webPage.Url,
                    Status = webPage.StatusCode
                };

                _parsedUrls.Add(urlReport);

                // Timing starts after the page has been fetched, so LoadTime covers the
                // parse step only. UTC is used so the measured span cannot be distorted
                // by a daylight-saving or time-zone change.
                DateTime started = DateTime.UtcNow;

                // Any non-200 response is an error; the report is not added to Urls then.
                if (webPage.StatusCode != 200)
                {
                    throw new Exception("Could not load url: " + url + " " + webPage.StatusCode);
                }

                if (webPage.Html != null)
                {
                    ParseUf(webPage.Html, url, formatDescriber, false, urlReport);
                }

                urlReport.LoadTime = DateTime.UtcNow.Subtract(started);
                Urls.Add(urlReport);
            }
            catch (Exception ex)
            {
                // Only exceptions without a message are replaced by a descriptive one;
                // the original is kept as the inner exception. Everything else is
                // rethrown untouched.
                if (ex.Message == string.Empty)
                {
                    throw new Exception("Could not load Url: " + url, ex);
                }
                throw;
            }
        }
        // Parse uf
        private void ParseUf(HtmlDocument htmlDoc, string url, UfFormatDescriber format, bool multiples, Url urlReport)
        {
            // Run a fresh parser over the supplied document.
            UfParse parser = new UfParse();
            parser.Load(htmlDoc, url, format);

            if (!multiples)
            {
                // Single page: the parse result replaces the current data.
                _data = parser.Data;
            }
            else
            {
                // Multiple pages: the parse result is added under the current data.
                _data.Nodes.Add(parser.Data);
            }

            // Copy the page title found by the parser onto the Url report.
            urlReport.HtmlPageTitle = parser.HtmlPageTitle;
        }
 /// <summary>
 /// Converts a UfDataNode structure into JSON
 /// </summary>
 /// <param name="node">Node</param>
 /// <param name="formatDescriber">Microformat format describer object</param>
 /// <param name="callBack">JSONP callback function name to wrap JSON object</param>
 /// <returns>JSON string</returns>
 public string Convert(UfDataNode node, UfFormatDescriber formatDescriber, string callBack)
 {
     // A null callback is treated as an empty one rather than failing with a
     // NullReferenceException on Replace.
     string requestedCallBack = callBack ?? string.Empty;

     // Strip parentheses and surrounding whitespace from the callback name.
     // NOTE(review): only "(" and ")" are removed; if the name comes from an
     // untrusted request, consider restricting it to identifier characters.
     this.callBack = requestedCallBack.Replace("(", "").Replace(")", "").Trim();

     return Convert(node, formatDescriber);
 }
        /// <summary>
        /// Converts a UfDataNode structure into JSON
        /// </summary>
        /// <param name="node">Node</param>
        /// <param name="formatDescriber">Microformat format describer object</param>
        /// <returns>JSON string</returns>
        public string Convert(UfDataNode node, UfFormatDescriber formatDescriber)
        {
            // Append every direct child whose name matches the base element to the
            // tree, together with its descendants when it has any.
            foreach (UfDataNode candidate in node.Nodes)
            {
                if (candidate.Name != formatDescriber.BaseElement.Name)
                    continue;

                UfDataNode appended = tree.Nodes.Append(candidate.Name, candidate.Value, candidate.SourceUrl, candidate.RepresentativeNode);
                if (candidate.Nodes.Count > 0)
                    AddChildNodes(appended, candidate, formatDescriber.BaseElement);
            }

            System.Text.StringBuilder json = new System.Text.StringBuilder();

            // Optional callback wrapper: callBack( ... )
            if (callBack != string.Empty)
                json.Append(callBack).Append("( ");

            json.Append("{\"microformats\": {");

            foreach (UfDataNode treeNode in tree.Nodes)
                json.Append(BuildDataString(treeNode, true, false));

            // Drop the last two characters written for the final tree node.
            if (tree.Nodes.Count > 0)
                json.Length -= 2;

            json.Append(AddUfErrors());
            json.Append(AddReporting(node));

            // End whole block
            json.Append("}}");

            if (callBack != string.Empty)
                json.Append(" )");

            return json.ToString();
        }