コード例 #1
0
        /// <summary>
        /// Consumes chunks from the parser and builds the bookmark entry they describe.
        /// </summary>
        /// <param name="parser">Parser to read from; it is stepped back when the next entry or a list boundary is hit.</param>
        /// <returns>
        /// A <see cref="BookmarkFolder"/> as soon as an opening h3 tag is found; otherwise the
        /// <see cref="BookmarkLink"/> collected from an opening a tag, or null if none was seen.
        /// </returns>
        private IBookmarkItem ParseItem(HTMLparser parser)
        {
            BookmarkLink link     = null;
            HTMLchunk    previous = parser.CurrentChunk;
            HTMLchunk    current;

            while ((current = parser.ParseNext()) != null)
            {
                bool isOpening = current.IsOpenTag;
                var  tag       = current.Tag;

                // A new dt entry, or any dl tag, is not part of this item:
                // rewind to the last chunk handled here and stop.
                if (tag == "dl" || (isOpening && tag == "dt"))
                {
                    parser.StepBack(previous);
                    break;
                }

                // A heading makes this entry a folder; it is returned immediately.
                if (isOpening && tag == "h3")
                {
                    BookmarkFolder heading = new BookmarkFolder();
                    AssignFolderAttributes(heading, current.oParams);
                    heading.Title = GetTextOrDontMove(parser);
                    return heading;
                }

                if (isOpening && tag == "a")
                {
                    link = new BookmarkLink();
                    AssignLinkAttributes(link, current.oParams);
                    link.Title = GetTextOrDontMove(parser);
                }
                else if (isOpening && tag == "dd" && link != null)
                {
                    // A description only counts once a link has been started.
                    link.Description = ParseDescription(parser);
                }

                previous = current;
            }

            return link;
        }
コード例 #2
0
        /// <summary>
        /// Creates an <see cref="HTMLparser"/> and applies this factory's fixed set of options.
        /// </summary>
        /// <returns>The newly created, configured parser.</returns>
        internal static HTMLparser GetInstance()
        {
            var parser = new HTMLparser();

            // Hash mode is on by default and puts tag parameters into the chunk's oParams
            // hashtable. Turning it off is the higher-performance option: parameters are
            // then stored in the chunk's sParams/sValues string arrays, with their count
            // in iParams.
            parser.SetChunkHashMode(false);

            // Keep the original HTML of every chunk. This costs some performance but is
            // useful when the HTML may have to be reconstructed.
            parser.bKeepRawHTML = true;

            // Decode entities (off by default) so that returned strings hold the final
            // representation of the data. Strings obtained this way must be entity-encoded
            // again before they are placed into output HTML.
            parser.bDecodeEntities = true;

            // "Mini Entities" mode keeps most entities as they are and replaces only a few.
            // It is meant for HTML that will be re-created after parsing, although in that
            // situation entities should arguably not be parsed at all.
            parser.bDecodeMiniEntities = true;

            // NOTE(review): bDecodeEntities was set to true a few lines up, so with the
            // current settings this condition is false and InitMiniEntities is not called.
            if (!parser.bDecodeEntities && parser.bDecodeMiniEntities)
            {
                parser.InitMiniEntities();
            }

            // For comments and SCRIPT tags, put only the data BETWEEN the tags into oHTML
            // instead of the complete raw HTML including the tags. This only has an effect
            // when auto extraction is enabled.
            parser.bAutoExtractBetweenTagsOnly = true;

            // Extract comments automatically.
            parser.bAutoKeepComments = true;

            // Extract scripts automatically.
            parser.bAutoKeepScripts = true;

            // Collapse whitespace in front of a tag to a single " ". Returning the full
            // whitespace (false) is slower and only needed when the exact whitespace
            // between tags matters.
            parser.bCompressWhiteSpaceBeforeTag = true;

            // When true (the default), tags with attributes that are marked as CLOSED
            // (trailing "/") are forced to count as open tags. That is unsuitable for XML
            // parsing, so it is switched off here.
            parser.bAutoMarkClosedTagsWithParamsAsOpen = false;

            return parser;
        }
コード例 #3
0
 /// <summary>
 /// Creates a lookup over the given HTML text.
 /// </summary>
 /// <param name="doc">HTML to parse. When it is null or empty, no parser is created.</param>
 public HtmlLookup(string doc)
 {
     if (string.IsNullOrEmpty(doc))
     {
         return;
     }

     _parser = new HTMLparser();
     _parser.Init(doc);
 }
コード例 #4
0
 /// <summary>
 /// Closes and disposes the underlying parser, if there is one, and clears the reference
 /// so that a second call does nothing.
 /// </summary>
 public void Dispose()
 {
     if (_parser == null)
     {
         return;
     }

     _parser.Close();
     _parser.Dispose();
     _parser = null;
 }
コード例 #5
0
        /// <summary>
        /// Reads the next chunk and returns its trimmed content when it is non-blank text.
        /// </summary>
        /// <param name="parser">Parser to advance by one chunk.</param>
        /// <returns>The trimmed text, or null when the next chunk is missing, not text, or only whitespace.</returns>
        private string ParseDescription(HTMLparser parser)
        {
            HTMLchunk next = parser.ParseNext();

            if (next == null || !next.IsText || string.IsNullOrWhiteSpace(next.HTML))
            {
                return null;
            }

            return next.HTML.Trim();
        }
コード例 #6
0
        /// <summary>
        /// Parses the given bytes with entity decoding enabled and returns the root folder.
        /// </summary>
        /// <param name="content">Raw document bytes passed to the parser's constructor.</param>
        /// <returns>The folder produced by <c>ParseFolder</c> in root mode.</returns>
        private BookmarkFolder Parse(byte[] content)
        {
            var parser = new HTMLparser(content);
            parser.DecodeEntities = true;

            return ParseFolder(parser, null, true);
        }
コード例 #7
0
        /// <summary>
        /// Returns the content of the next chunk if it is text; otherwise leaves the parser
        /// where it was by stepping back over the chunk that was read.
        /// </summary>
        /// <param name="parser">Parser to peek into.</param>
        /// <returns>The text chunk's HTML, or null when the next chunk is not text or the input has ended.</returns>
        private string GetTextOrDontMove(HTMLparser parser)
        {
            var textChunk = parser.ParseNext();

            // ParseNext returns null once the input is exhausted. There is then nothing
            // to read and nothing to rewind, so report "no text" instead of dereferencing null.
            if (textChunk == null)
            {
                return(null);
            }

            if (textChunk.IsText)
            {
                return(textChunk.HTML);
            }

            // Not text: undo the read so the caller sees this chunk again.
            parser.StepBack(textChunk);
            return(null);
        }
コード例 #8
0
        /// <summary>
        /// Exercises font size parsing and the font size comparison helpers.
        /// </summary>
        public void FontSizes()
        {
            var small   = HTMLparser.FontSize.Small;
            var medium  = HTMLparser.FontSize.Medium;
            var large   = HTMLparser.FontSize.Large;
            var unknown = HTMLparser.FontSize.Unknown;

            // Relative sizes are resolved against the current size.
            Assert.AreEqual(large, HTMLparser.ParseFontSize("+1", medium));
            Assert.AreEqual(small, HTMLparser.ParseFontSize("-1", medium));

            // Empty or unparsable input yields Unknown.
            Assert.AreEqual(unknown, HTMLparser.ParseFontSize("", medium));
            Assert.AreEqual(unknown, HTMLparser.ParseFontSize("ald", medium));

            // Ordering helpers.
            Assert.IsFalse(HTMLparser.IsBiggerFont(small, large));
            Assert.IsTrue(HTMLparser.IsBiggerFont(medium, small));
            Assert.IsTrue(HTMLparser.IsEqualOrBiggerFont(medium, medium));
            Assert.IsTrue(HTMLparser.IsEqualOrBiggerFont(medium, small));
        }
コード例 #9
0
        /// <summary>
        /// Replaces the shared parser with a fresh instance that decodes entities,
        /// closing the previous instance first if there is one.
        /// </summary>
        public void CreateParser()
        {
            // Close and drop whatever parser is still around.
            if (oP != null)
            {
                oP.Close();
                oP = null;
            }

            oP = new HTMLparser { bDecodeEntities = true };

            // Dummy assertion: only confirms that a parser instance exists.
            Assert.IsNotNull(oP);
        }
コード例 #10
0
        /// <summary>
        /// Exercises <c>HTMLparser.CalculateWidth</c> with absolute, empty, oversized and
        /// percentage inputs against an available width of 1000.
        /// </summary>
        public void Widths()
        {
            const int availableWidth = 1000;
            bool      isRelative     = false;

            // A plain number is used as-is and is not relative.
            Assert.AreEqual(500, HTMLparser.CalculateWidth("500", availableWidth, ref isRelative));
            Assert.IsFalse(isRelative);

            // An empty value is expected to give the available width.
            Assert.AreEqual(availableWidth, HTMLparser.CalculateWidth("", availableWidth, ref isRelative));

            // So is a value far larger than the available width.
            Assert.AreEqual(availableWidth, HTMLparser.CalculateWidth("10000000000000", availableWidth, ref isRelative));

            // A percentage is taken from the available width and flagged as relative.
            Assert.AreEqual(500, HTMLparser.CalculateWidth("50%", availableWidth, ref isRelative));
            Assert.IsTrue(isRelative);
        }
コード例 #11
0
ファイル: DictCnParser.cs プロジェクト: colt365/lunar-thu
        /// <summary>
        /// Parses the given HTML chunk by chunk and fills a new <c>DictResult</c> tagged as
        /// <c>DictionaryType.Dict_cn</c>.
        /// </summary>
        /// <param name="html">Page source; it is converted to bytes with <paramref name="encoding"/>.</param>
        /// <param name="encoding">Encoding used for the byte conversion and set on the parser.</param>
        /// <returns>The result object stored in the <c>dictResult</c> field.</returns>
        public SmartMe.Core.Data.DictResult Parse(string html, Encoding encoding)
        {
            dictResult = new DictResult();
            HTMLparser parser = HtmlParserFactory.GetInstance();

            dictResult.DictionaryType = DictionaryType.Dict_cn;

            parser.Init(encoding.GetBytes(html));
            parser.SetEncoding(encoding);

            int  parseState  = 0;
            bool encodingSet = false;

            for (HTMLchunk chunk = parser.ParseNext(); chunk != null; chunk = parser.ParseNext())
            {
                switch (chunk.oType)
                {
                case HTMLchunkType.OpenTag:
                    HandleOpenTag(chunk, ref parseState);

                    // META tags may declare the page encoding.
                    if (chunk.sTag == "meta")
                    {
                        HandleMetaEncoding(parser, chunk, ref encodingSet);
                    }

                    HandleParam(chunk, ref parseState);
                    break;

                case HTMLchunkType.CloseTag:
                    HandleCloseTag(chunk, ref parseState);
                    break;

                case HTMLchunkType.Text:
                    HandleText(chunk, ref parseState);
                    break;
                }
            }

            return dictResult;
        }
コード例 #12
0
ファイル: BaiduParser.cs プロジェクト: colt365/lunar-thu
        /// <summary>
        /// Parses the given HTML chunk by chunk and fills a new <c>SearchEngineResult</c>
        /// tagged as <c>SearchEngineType.Baidu</c>.
        /// </summary>
        /// <param name="html">Page source; it is converted to bytes with <paramref name="encoding"/>.</param>
        /// <param name="encoding">Encoding used for the byte conversion and set on the parser.</param>
        /// <returns>The result object stored in the <c>searchResult</c> field.</returns>
        public SearchEngineResult Parse(string html, Encoding encoding)
        {
            HTMLparser parser = HtmlParserFactory.GetInstance();

            searchResult = new SearchEngineResult();
            searchResult.SearchEngineType = SearchEngineType.Baidu;
            item = new SearchEngineResult.ResultItem();

            parser.Init(encoding.GetBytes(html));
            parser.SetEncoding(encoding);

            int  parseState  = 0;
            bool encodingSet = false;

            for (HTMLchunk chunk = parser.ParseNext(); chunk != null; chunk = parser.ParseNext())
            {
                switch (chunk.oType)
                {
                case HTMLchunkType.OpenTag:
                    HandleOpenTag(chunk, ref parseState);

                    // META tags may declare the page encoding.
                    if (chunk.sTag == "meta")
                    {
                        HandleMetaEncoding(parser, chunk, ref encodingSet);
                    }

                    HandleParam(chunk, ref parseState);
                    break;

                case HTMLchunkType.CloseTag:
                    HandleCloseTag(chunk, ref parseState);
                    break;

                case HTMLchunkType.Text:
                    HandleText(chunk, ref parseState);
                    break;
                }
            }

            return searchResult;
        }
コード例 #13
0
        /// <summary>
        /// Reads chunks until the closing dl tag (or end of input) and collects the entries
        /// found on the way into a folder.
        /// </summary>
        /// <param name="parser">Parser to read from.</param>
        /// <param name="folderBase">Folder to fill; a new one is created when this is null.</param>
        /// <param name="root">
        /// When true, the first opening dl tag replaces the working folder with the parsed
        /// nested folder instead of adding that folder as a child.
        /// </param>
        /// <returns>The folder that was filled.</returns>
        private BookmarkFolder ParseFolder(HTMLparser parser, BookmarkFolder folderBase, bool root = false)
        {
            BookmarkFolder current = folderBase ?? new BookmarkFolder();

            // Folder announced by the most recent heading entry; it is handed to the
            // recursive call when the matching opening dl tag shows up.
            BookmarkFolder pending = null;

            AssignFolderAttributes(current, current.Attributes);

            for (HTMLchunk chunk = parser.ParseNext(); chunk != null; chunk = parser.ParseNext())
            {
                if (chunk.Type == HTMLchunkType.OpenTag && chunk.Tag == "dt")
                {
                    IBookmarkItem  parsed   = ParseItem(parser);
                    BookmarkFolder asFolder = parsed as BookmarkFolder;

                    if (asFolder != null)
                    {
                        pending = asFolder;
                    }
                    else if (parsed != null)
                    {
                        current.Add(parsed);
                    }
                }
                else if (chunk.IsOpenTag && chunk.Tag == "dl")
                {
                    BookmarkFolder nested = ParseFolder(parser, pending);

                    if (root)
                    {
                        current = nested;
                        root    = false;
                    }
                    else
                    {
                        current.Add(nested);
                    }
                }
                else if (chunk.IsCloseTag && chunk.Tag == "dl")
                {
                    break;
                }
            }

            return current;
        }
コード例 #14
0
ファイル: BaiduParser.cs プロジェクト: colt365/lunar-thu
        /// <summary>
        /// Passes a META chunk to <c>HTMLparser.HandleMetaEncoding</c> unless an encoding
        /// has already been set, and reports on the console when setting it failed.
        /// </summary>
        /// <param name="oP">Parser whose encoding may be updated.</param>
        /// <param name="oChunk">The META tag chunk.</param>
        /// <param name="bEncodingSet">Whether an encoding has been set; may be updated by the call.</param>
        private void HandleMetaEncoding(HTMLparser oP, HTMLchunk oChunk, ref bool bEncodingSet)
        {
            // The first encoding wins, which is the logic major browsers follow,
            // so later META tags are ignored once one has been set.
            if (bEncodingSet)
            {
                return;
            }

            bool handled = HTMLparser.HandleMetaEncoding(oP, oChunk, ref bEncodingSet);

            if (handled && !bEncodingSet)
            {
                Console.WriteLine("Failed to set encoding from META: {0}", oChunk.GenerateHTML());
            }
        }
コード例 #15
0
ファイル: HtmlSerializer.cs プロジェクト: pnoodles/hfm-net
        /// <summary>
        /// Scans the HTML for table rows and turns every row whose first cell is an integer
        /// project number into a <c>Protein</c>.
        /// </summary>
        /// <param name="html">HTML text to parse.</param>
        /// <returns>The proteins read, in document order; empty when no row qualified.</returns>
        private static List<Protein> ParseProteins(string html)
        {
            var htmlParser = new HTMLparser();

            htmlParser.Init(html);
            var proteins = new List<Protein>();

            for (HTMLchunk chunk = htmlParser.ParseNext(); chunk != null; chunk = htmlParser.ParseNext())
            {
                // Only opening "tr" tags start a record.
                if (!chunk.oType.Equals(HTMLchunkType.OpenTag) ||
                    chunk.sTag.ToLower() != "tr")
                {
                    continue;
                }

                var protein = new Protein();

                // Rows whose first cell is not an integer are skipped.
                int projectNumber;
                if (!Int32.TryParse(GetNextTdValue(htmlParser), NumberStyles.Integer, CultureInfo.InvariantCulture, out projectNumber))
                {
                    continue;
                }

                // The remaining cells are read strictly in column order.
                protein.ProjectNumber = projectNumber;
                protein.ServerIP      = GetNextTdValue(htmlParser);
                protein.WorkUnitName  = GetNextTdValue(htmlParser);
                protein.NumberOfAtoms = ToInt32OrDefault(GetNextTdValue(htmlParser));
                protein.PreferredDays = ToDoubleOrDefault(GetNextTdValue(htmlParser));
                protein.MaximumDays   = ToDoubleOrDefault(GetNextTdValue(htmlParser));
                protein.Credit        = ToDoubleOrDefault(GetNextTdValue(htmlParser));
                protein.Frames        = ToInt32OrDefault(GetNextTdValue(htmlParser));
                protein.Core          = GetNextTdValue(htmlParser);
                protein.Description   = GetNextTdValue(htmlParser, "href");
                protein.Contact       = GetNextTdValue(htmlParser);
                protein.KFactor       = ToDoubleOrDefault(GetNextTdValue(htmlParser));

                proteins.Add(protein);
            }

            return proteins;
        }
コード例 #16
0
ファイル: UtilityHtmlParser.cs プロジェクト: TRYMYOUT/lookin
        /// <summary>
        /// Creates a new parser for the session, loads the response body into it when the
        /// response is HTML or XML, and sets the parser's encoding from the session's charset.
        /// </summary>
        /// <param name="session">Session whose response is to be parsed.</param>
        public void Open(Session session)
        {
            Parser = new HTMLparser();

            try
            {
                if (Utility.IsResponseHtml(session) || Utility.IsResponseXml(session))
                {
                    // A missing body is parsed as an empty document.
                    byte[] body = session.responseBodyBytes ?? new byte[] { };

                    Parser.Init(body);
                    Parser.bAutoKeepScripts  = true;
                    Parser.bEnableHeuristics = false;

                    // With bAutoExtractBetweenTagsOnly set to false the parser sees attributes
                    // in script tags, such as <script src="mydata">; with true it does not.
                    Parser.bAutoExtractBetweenTagsOnly = true;
                }
            }
            catch (Exception e)
            {
                Trace.TraceWarning("Warning: UtilityHtmlParser threw an unhandled exception: {0}", e.Message);
                ExceptionLogger.HandleException(e);
            }

            // Encoding name taken from the HTML or the HTTP headers.
            String charset = Utility.GetHtmlCharset(session);

            try
            {
                // TODO: check if the encoding is a known good before continuing!!!
                // GetEncoding is called only as a validity check: it should throw an
                // ArgumentException when the name is not a valid system encoding name.
                Encoding.GetEncoding(charset);
                Parser.SetEncoding(charset);
            }
            catch (ArgumentException)
            {
                // Fall back to UTF-8 when the charset name is rejected.
                Parser.SetEncoding(new UTF8Encoding(false, false));
            }
        }
コード例 #17
0
ファイル: HtmlSerializer.cs プロジェクト: pnoodles/hfm-net
        /// <summary>
        /// Advances the parser to the next opening tag named <paramref name="tagName"/> and
        /// returns either the text inside it or a parameter of the tag nested directly inside it.
        /// </summary>
        /// <param name="htmlParser">Parser to advance.</param>
        /// <param name="tagName">Tag name to look for; it is compared against the lower-cased tag of each chunk.</param>
        /// <param name="paramName">
        /// Name of the parameter to read from the first chunk inside the tag. When null or empty,
        /// the text inside the tag (optionally wrapped in a "font" tag) is returned instead.
        /// </param>
        /// <returns>
        /// The trimmed text or the parameter value; <see cref="String.Empty"/> when the tag is
        /// found without the expected content, or when the input ends before the tag is found.
        /// </returns>
        public static string GetNextValue(HTMLparser htmlParser, string tagName, string paramName)
        {
            // A null parameter name is treated like an empty one rather than throwing.
            bool wantsText = String.IsNullOrEmpty(paramName);

            HTMLchunk oChunk;

            while ((oChunk = htmlParser.ParseNext()) != null)
            {
                // Skip everything up to the next opening tag with the requested name.
                // ToLowerInvariant keeps the match independent of the current culture
                // (a culture-sensitive ToLower maps "I" differently under Turkish cultures).
                if (!oChunk.oType.Equals(HTMLchunkType.OpenTag) ||
                    oChunk.sTag.ToLowerInvariant() != tagName)
                {
                    continue;
                }

                // Look at the first chunk inside the matched tag.
                oChunk = htmlParser.ParseNext();
                if (oChunk == null)
                {
                    return(String.Empty);
                }

                if (!wantsText)
                {
                    // Parameter mode: the first chunk must be an opening tag carrying the parameter.
                    if (oChunk.oType.Equals(HTMLchunkType.OpenTag) &&
                        oChunk.oParams.Contains(paramName))
                    {
                        return(oChunk.oParams[paramName].ToString());
                    }

                    return(String.Empty);
                }

                // Text mode: the text may be wrapped in a "font" tag, so look one chunk further.
                if (oChunk.oType.Equals(HTMLchunkType.OpenTag) &&
                    oChunk.sTag.ToLowerInvariant() == "font")
                {
                    oChunk = htmlParser.ParseNext();
                }

                if (oChunk != null &&
                    oChunk.oType.Equals(HTMLchunkType.Text))
                {
                    return(oChunk.oHTML.Trim());
                }

                // Only the first matching tag is examined, even when it held nothing usable.
                return(String.Empty);
            }

            return(String.Empty);
        }
コード例 #18
0
ファイル: HtmlSerializer.cs プロジェクト: pnoodles/hfm-net
 /// <summary>
 /// Returns the value of the next "th" tag, as read by <c>GetNextValue</c> with no parameter name.
 /// </summary>
 /// <param name="htmlParser">Parser to advance.</param>
 public static string GetNextThValue(HTMLparser htmlParser)
 {
     const string headerCellTag = "th";

     return GetNextValue(htmlParser, headerCellTag, String.Empty);
 }
コード例 #19
0
ファイル: HtmlSerializer.cs プロジェクト: pnoodles/hfm-net
 /// <summary>
 /// Returns the value of the next "td" tag, as read by <c>GetNextValue</c>.
 /// </summary>
 /// <param name="htmlParser">Parser to advance.</param>
 /// <param name="paramName">Parameter name passed through to <c>GetNextValue</c>.</param>
 public static string GetNextTdValue(HTMLparser htmlParser, string paramName)
 {
     const string dataCellTag = "td";

     return GetNextValue(htmlParser, dataCellTag, paramName);
 }
コード例 #20
0
ファイル: HtmlSerializer.cs プロジェクト: pnoodles/hfm-net
 /// <summary>
 /// Returns the value of the next "td" tag by calling the two-argument overload with an
 /// empty parameter name.
 /// </summary>
 /// <param name="pSummary">Parser to advance.</param>
 public static string GetNextTdValue(HTMLparser pSummary)
 {
     string noParameter = String.Empty;

     return GetNextTdValue(pSummary, noParameter);
 }
コード例 #21
0
        /// <summary>
        /// Downloads the page built from <c>_noaaSstGetUrl</c> for the date ten days before
        /// today, scans it for anchors whose href starts with "ftp://", and downloads the
        /// target to "SST.nc" in the simulation work folder unless that file already exists.
        /// </summary>
        /// <returns>
        /// Always false.
        /// NOTE(review): success is never reported to the caller. This looks unintended, but it
        /// is kept because callers may depend on it; confirm the intended contract.
        /// </returns>
        private bool RequestSstFile()
        {
            try
            {
                DateTime dt     = DateTime.Today.AddDays(-10);
                string   getUrl = ReplaceMacros(_noaaSstGetUrl, dt);
                string   response;

                // The callback is a static, process-wide delegate. Unsubscribe before
                // subscribing so that repeated calls do not stack up duplicate handlers.
                ServicePointManager.ServerCertificateValidationCallback -= ValidateRemoteCertificate;
                ServicePointManager.ServerCertificateValidationCallback += ValidateRemoteCertificate;
                ServicePointManager.SecurityProtocol = (SecurityProtocolType)3072; //TLS 1.2

                using (WebClient wc = new WebClient())
                {
                    response = wc.DownloadString(getUrl);
                }

                HTMLparser parser = new HTMLparser();
                parser.Init(response);

                HTMLchunk chunk;

                while ((chunk = parser.ParseNext()) != null)
                {
                    // Only opening <a> tags that carry parameters are of interest.
                    if (chunk.sTag != "a")
                    {
                        continue;
                    }
                    if (chunk.oType != HTMLchunkType.OpenTag)
                    {
                        continue;
                    }
                    if (chunk.oParams == null || chunk.oParams.Count < 1)
                    {
                        continue;
                    }

                    string href = chunk.oParams["href"]?.ToString();
                    if (string.IsNullOrEmpty(href))
                    {
                        continue;
                    }

                    // URL schemes are not linguistic text, so compare ordinally
                    // instead of with the current culture.
                    if (href.StartsWith("ftp://", StringComparison.Ordinal))
                    {
                        string file = Path.Combine(SimulationData.WorkFolder, "SST.nc");

                        // An existing file is never overwritten.
                        if (File.Exists(file) == false)
                        {
                            using (WebClient wc = new WebClient())
                            {
                                wc.DownloadFile(href, file);
                            }
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                // Still best effort: a failed download must not bring the caller down,
                // but the failure is now traced instead of being dropped silently.
                System.Diagnostics.Trace.TraceWarning("RequestSstFile failed: {0}", ex);
            }

            return(false);
        }
コード例 #22
0
        /// <summary>
        /// Parses raw HTML bytes with the Majestic-12 parser and rebuilds the resulting
        /// chunks as a LINQ to XML tree underneath a synthetic "document" element.
        /// </summary>
        /// <param name="htmlAsBytes">The HTML to convert, passed to the parser's <c>Init</c>.</param>
        /// <returns>
        /// The child nodes of the topmost ancestor of the node the conversion ended on
        /// (the tree is walked up via <c>Parent</c> before returning).
        /// </returns>
        /// <exception cref="Exception">
        /// Any exception raised while handling a chunk is rethrown wrapped in a new exception
        /// whose <c>Data["source"]</c> entry holds that chunk's bytes converted to a string.
        /// </exception>
        static public IEnumerable <XNode> ConvertNodesToXml(byte[] htmlAsBytes)
        {
            HTMLparser parser = OpenParser();

            parser.Init(htmlAsBytes);

            // Cursor into the tree being built; starts at the synthetic root.
            XElement currentNode = new XElement("document");

            HTMLchunk m12chunk = null;

            int    xmlnsAttributeIndex = 0;
            // Stays empty during normal processing: it is only assigned inside the catch
            // block below, right before the wrapped exception is thrown.
            string originalHtml        = "";

            while ((m12chunk = parser.ParseNext()) != null)
            {
                try
                {
                    Debug.Assert(!m12chunk.bHashMode);  // popular default for Majestic-12 setting

                    XNode    newNode        = null;
                    XElement newNodesParent = null;

                    switch (m12chunk.oType)
                    {
                    case HTMLchunkType.OpenTag:

                        // Tags are added as a child to the current tag,
                        // except when the new tag implies the closure of
                        // some number of ancestor tags.

                        newNode = ParseTagNode(m12chunk, originalHtml, ref xmlnsAttributeIndex);

                        if (newNode != null)
                        {
                            currentNode = FindParentOfNewNode(m12chunk, originalHtml, currentNode);

                            newNodesParent = currentNode;

                            newNodesParent.Add(newNode);

                            // Descend into the new node so that following chunks become its children.
                            // NOTE(review): if ParseTagNode can return a non-XElement node, the
                            // cursor becomes null here — confirm against ParseTagNode.
                            currentNode = newNode as XElement;
                        }

                        break;

                    case HTMLchunkType.CloseTag:

                        if (m12chunk.bEndClosure)
                        {
                            // Handled like an open tag, except that the cursor does not
                            // descend into the new node afterwards.
                            newNode = ParseTagNode(m12chunk, originalHtml, ref xmlnsAttributeIndex);

                            if (newNode != null)
                            {
                                currentNode = FindParentOfNewNode(m12chunk, originalHtml, currentNode);

                                newNodesParent = currentNode;
                                newNodesParent.Add(newNode);
                            }
                        }
                        else
                        {
                            // Walk up from the cursor to the nearest element with the same
                            // (cleaned-up) name as the closing tag.
                            XElement nodeToClose = currentNode;

                            string m12chunkCleanedTag = CleanupTagName(m12chunk.sTag, originalHtml);

                            while (nodeToClose != null && nodeToClose.Name.LocalName != m12chunkCleanedTag)
                            {
                                nodeToClose = nodeToClose.Parent;
                            }

                            // If one was found, continue in its parent; a closing tag
                            // without a matching open ancestor leaves the cursor unchanged.
                            if (nodeToClose != null)
                            {
                                currentNode = nodeToClose.Parent;
                            }

                            Debug.Assert(currentNode != null);
                        }

                        break;

                    case HTMLchunkType.Script:

                        // Script content is not copied; a placeholder element is added instead.
                        newNode        = new XElement("script", "REMOVED");
                        newNodesParent = currentNode;
                        newNodesParent.Add(newNode);
                        break;

                    case HTMLchunkType.Comment:

                        newNodesParent = currentNode;

                        // Comment chunks cover both HTML comments and CDATA sections,
                        // told apart by the chunk's tag.
                        if (m12chunk.sTag == "!--")
                        {
                            newNode = new XComment(m12chunk.oHTML);
                        }
                        else if (m12chunk.sTag == "![CDATA[")
                        {
                            newNode = new XCData(m12chunk.oHTML);
                        }
                        else
                        {
                            throw new Exception("Unrecognized comment sTag");
                        }

                        newNodesParent.Add(newNode);

                        break;

                    case HTMLchunkType.Text:

                        // Text is appended directly to the element under the cursor.
                        currentNode.Add(m12chunk.oHTML);
                        break;

                    default:
                        break;
                    }
                }
                catch (Exception e)
                {
                    var wrappedE = new Exception("Error using Majestic12.HTMLChunk, reason: " + e.Message, e);

                    // the original html is copied for tracing/debugging purposes
                    // (each byte of the chunk's range is cast to a char individually)
                    originalHtml = new string(htmlAsBytes.Skip(m12chunk.iChunkOffset)
                                              .Take(m12chunk.iChunkLength)
                                              .Select(B => (char)B).ToArray());

                    wrappedE.Data.Add("source", originalHtml);

                    throw wrappedE;
                }
            }

            // Climb back to the topmost element before handing out its children.
            while (currentNode.Parent != null)
            {
                currentNode = currentNode.Parent;
            }

            return(currentNode.Nodes());
        }
コード例 #23
0
        /// <summary>
        /// Download project information from Stanford University (psummaryC.html)
        /// </summary>
        /// <param name="State">Null in this implementation</param>
        public void DownloadFromStanford(Object State /* null */)
        {
            DateTime Start = Debug.ExecStart;

            lock (this)
            {
                Preferences.PreferenceSet Prefs = Preferences.PreferenceSet.Instance;

                WebRequest wrq = (WebRequest)WebRequest.Create("http://vspx27.stanford.edu/psummaryC.html");
                wrq.Method = WebRequestMethods.Http.Get;
                WebResponse  wrs;
                StreamReader sr1;
                if (Prefs.UseProxy)
                {
                    wrq.Proxy = new WebProxy(Prefs.ProxyServer, Prefs.ProxyPort);
                    if (Prefs.UseProxyAuth)
                    {
                        wrq.Proxy.Credentials = new NetworkCredential(Prefs.ProxyUser, Prefs.ProxyPass);
                    }
                }
                else
                {
                    wrq.Proxy = null;
                }

                // TODO: Handle timeouts and errors
                try
                {
                    wrs = (WebResponse)wrq.GetResponse();
                    sr1 = new StreamReader(wrs.GetResponseStream(), Encoding.ASCII);

                    if ((wrs == null) || (sr1 == null))
                    {
                        throw new IOException("The web response or stream was null");
                    }
                }
                catch (WebException ExWeb)
                {
                    ClassLogger.LogException(LogLevel.Warn, String.Format("{0} threw WebException {1}.", Debug.FunctionName, ExWeb.Message), null);
                    ClassLogger.Log(LogLevel.Trace, String.Format("{0} Execution Time: {1}", Debug.FunctionName, Debug.GetExecTime(Start)), "");
                    return;
                }
                catch (IOException ExIO)
                {
                    ClassLogger.LogException(LogLevel.Warn, String.Format("{0} threw IOException {1}.", Debug.FunctionName, ExIO.Message), null);
                    ClassLogger.Log(LogLevel.Trace, String.Format("{0} Execution Time: {1}", Debug.FunctionName, Debug.GetExecTime(Start)), "");
                    return;
                }
                catch (Exception Ex)
                {
                    ClassLogger.LogException(LogLevel.Warn, String.Format("{0} threw WebException {1}.", Debug.FunctionName, Ex.Message), null);
                    ClassLogger.Log(LogLevel.Trace, String.Format("{0} Execution Time: {1}", Debug.FunctionName, Debug.GetExecTime(Start)), "");
                    return;
                }

                HTMLparser pSummary     = new HTMLparser();
                String     sSummaryPage = sr1.ReadToEnd();
                pSummary.Init(sSummaryPage);

                // Locate the table
                HTMLchunk oChunk = null;

                // Parse until returned oChunk is null indicating we reached end of parsing
                while ((oChunk = pSummary.ParseNext()) != null)
                {
                    if (oChunk.sTag.ToLower() == "tr")
                    {
                        Protein p = new Protein();
                        while (((oChunk = pSummary.ParseNext()) != null) && (oChunk.sTag.ToLower() != "td"))
                        {
                            ; // Do nothing!
                        }
                        // Skip the empty attributes
                        oChunk = pSummary.ParseNext();
                        try
                        {
                            #region Parse Code for HTML Table
                            // Suck out the project number
                            p.ProjectNumber = Int32.Parse(oChunk.oHTML.ToString());

                            // Skip the closing tag, opening tags and attributes
                            oChunk     = pSummary.ParseNext();
                            oChunk     = pSummary.ParseNext();
                            oChunk     = pSummary.ParseNext();
                            p.ServerIP = oChunk.oHTML.ToString().Trim();

                            // Skip the closing tag, opening tags and attributes
                            oChunk         = pSummary.ParseNext();
                            oChunk         = pSummary.ParseNext();
                            oChunk         = pSummary.ParseNext();
                            p.WorkUnitName = oChunk.oHTML.ToString().Trim();

                            // Skip the closing tag, opening tags and attributes
                            oChunk     = pSummary.ParseNext();
                            oChunk     = pSummary.ParseNext();
                            oChunk     = pSummary.ParseNext();
                            p.NumAtoms = Int32.Parse(oChunk.oHTML.ToString());

                            // Skip the closing tag, opening tags and attributes
                            oChunk          = pSummary.ParseNext();
                            oChunk          = pSummary.ParseNext();
                            oChunk          = pSummary.ParseNext();
                            p.PreferredDays = Int32.Parse(oChunk.oHTML.ToString().Substring(0, oChunk.oHTML.IndexOf('.')).Trim());

                            // Skip the closing tag, opening tags and attributes
                            oChunk = pSummary.ParseNext();
                            oChunk = pSummary.ParseNext();
                            oChunk = pSummary.ParseNext();
                            try
                            {
                                p.MaxDays = Int32.Parse(oChunk.oHTML.ToString().Substring(0, oChunk.oHTML.IndexOf('.')).Trim());
                            }
                            catch
                            {
                                p.MaxDays = 0;
                            }

                            oChunk   = pSummary.ParseNext();
                            oChunk   = pSummary.ParseNext();
                            oChunk   = pSummary.ParseNext();
                            p.Credit = Int32.Parse(oChunk.oHTML.ToString().Substring(0, oChunk.oHTML.IndexOf('.')).Trim());

                            oChunk   = pSummary.ParseNext();
                            oChunk   = pSummary.ParseNext();
                            oChunk   = pSummary.ParseNext();
                            p.Frames = Int32.Parse(oChunk.oHTML.ToString().Trim());

                            oChunk = pSummary.ParseNext();
                            oChunk = pSummary.ParseNext();
                            oChunk = pSummary.ParseNext();
                            p.Core = oChunk.oHTML.ToString();

                            oChunk        = pSummary.ParseNext();
                            oChunk        = pSummary.ParseNext();
                            oChunk        = pSummary.ParseNext();
                            p.Description = oChunk.oParams["href"].ToString();

                            oChunk    = pSummary.ParseNext();
                            oChunk    = pSummary.ParseNext();
                            oChunk    = pSummary.ParseNext();
                            oChunk    = pSummary.ParseNext();
                            oChunk    = pSummary.ParseNext();
                            oChunk    = pSummary.ParseNext();
                            p.Contact = oChunk.oHTML.ToString();
                            #endregion

                            if (this.ContainsKey(p.ProjectNumber))
                            {
                                this[p.ProjectNumber] = p;
                            }
                            else
                            {
                                this.Add(p.ProjectNumber, p);
                            }
                        }
                        catch (Exception Ex)
                        {
                            // Ignore this row of the table - unparseable
                            ClassLogger.LogException(LogLevel.Warn, String.Format("{0} threw exception while parsing HTML: {1}", Debug.FunctionName, Ex.Message), null);
                        }
                    }
                }
                if (this.Count > 0)
                {
                    OnNFOUpdated(new NFOUpdatedEventArgs());
                }
            }

            SaveToCSV(_LocalNFOFile);

            ClassLogger.Log(LogLevel.Trace, String.Format("{0} loaded {1} proteins from Stanford", Debug.FunctionName, ProteinCollection.Instance.Count), "");
            ClassLogger.Log(LogLevel.Trace, String.Format("{0} Execution Time: {1}", Debug.FunctionName, Debug.GetExecTime(Start)), "");
            return;
        }
コード例 #24
0
        // ======================================================================
        // Parse and create boxes
        // ======================================================================

        /// <summary>
        /// Converts a markup string into a tree of block elements holding atoms
        /// (characters, images and nested blocks).
        /// </summary>
        /// <param name="html">The markup text to interpret.</param>
        /// <returns>The root block; every parsed atom is added to it or to one of its nested blocks.</returns>
        private BlockElement ParseHtmlToBlocks(string html)
        {
            IResourceProvider provider = ServiceRegistry.GetService <IResourceProvider>();
            StyleParser       styles   = new StyleParser(provider);

            BlockElement root, currentBlock;

            root = currentBlock = new BlockElement("root", styles.Style); // this is the root!

            // if this is not HTML, do not parse tags. Otherwise search out and interpret tags.
            // NOTE(review): this flag is always true here, so the plain-text branch below never runs;
            // it is kept as an in-code toggle.
            bool parseHTML = true;

            if (!parseHTML)
            {
                for (int i = 0; i < html.Length; i++)
                {
                    currentBlock.AddAtom(new CharacterElement(styles.Style, html[i]));
                }
            }
            else
            {
                HTMLparser parser = new HTMLparser(html);
                HTMLchunk  chunk;

                while ((chunk = ParseNext(parser)) != null)
                {
                    if (!(chunk.oHTML == string.Empty))
                    {
                        // This is a span of text.
                        string text = chunk.oHTML;
                        // make sure to replace escape characters!
                        text = EscapeCharacters.ReplaceEscapeCharacters(text);
                        // Add the characters to the current box, one atom per character.
                        for (int i = 0; i < text.Length; i++)
                        {
                            currentBlock.AddAtom(new CharacterElement(styles.Style, text[i]));
                        }
                    }
                    else
                    {
                        // This is a tag. interpret the tag and edit the openTags list.
                        // It may also be an atom, in which case we should add it to the list of atoms!
                        AElement atom = null;

                        if (chunk.bClosure && !chunk.bEndClosure)
                        {
                            // A plain closing tag: drop the matching style and, if it closes the
                            // current block, step back out to the enclosing block.
                            styles.CloseOneTag(chunk);

                            // Never step above the root: a stray closing tag whose name equals the
                            // root block's tag would otherwise leave currentBlock null and crash the
                            // next AddAtom call. (Assumes the root's Parent is null - TODO confirm.)
                            if (currentBlock.Tag == chunk.sTag && currentBlock.Parent != null)
                            {
                                currentBlock = currentBlock.Parent;
                            }
                        }
                        else
                        {
                            bool isBlockTag = false;
                            switch (chunk.sTag)
                            {
                            // ======================================================================
                            // Anchor elements are added to the open tag collection as HREFs.
                            // ======================================================================
                            case "a":
                                styles.InterpretHREF(chunk, null);
                                break;

                            // ======================================================================
                            // These html elements are ignored.
                            // ======================================================================
                            case "body":
                                break;

                            // ======================================================================
                            // These html elements are blocks but can also have styles
                            // ======================================================================
                            case "center":
                            case "left":
                            case "right":
                            case "div":
                                atom = new BlockElement(chunk.sTag, styles.Style);
                                styles.ParseTag(chunk, atom);
                                isBlockTag = true;
                                break;

                            // ======================================================================
                            // These html elements are styles, and are added to the StyleParser.
                            // ======================================================================
                            case "span":
                            case "font":
                            case "b":
                            case "i":
                            case "u":
                            case "outline":
                            case "big":
                            case "basefont":
                            case "medium":
                            case "small":
                                styles.ParseTag(chunk, null);
                                break;

                            // ======================================================================
                            // These html elements are added as atoms only. They cannot impart style
                            // onto other atoms.
                            // ======================================================================
                            case "br":
                                atom = new CharacterElement(styles.Style, '\n');
                                break;

                            case "gumpimg":
                                // draw a gump image
                                atom = new ImageElement(styles.Style, ImageElement.ImageTypes.UI);
                                styles.ParseTag(chunk, atom);
                                break;

                            case "itemimg":
                                // draw a static image
                                atom = new ImageElement(styles.Style, ImageElement.ImageTypes.Item);
                                styles.ParseTag(chunk, atom);
                                break;

                            // ======================================================================
                            // Every other element is not interpreted, but rendered as text. Easy!
                            // ======================================================================
                            default:
                            {
                                // Recover the tag's raw source text from the input using the
                                // offset and length recorded on the chunk.
                                string text = html.Substring(chunk.iChunkOffset, chunk.iChunkLength);
                                // make sure to replace escape characters!
                                text = EscapeCharacters.ReplaceEscapeCharacters(text);
                                // Add the characters to the current box
                                for (int i = 0; i < text.Length; i++)
                                {
                                    currentBlock.AddAtom(new CharacterElement(styles.Style, text[i]));
                                }
                            }
                            break;
                            }

                            if (atom != null)
                            {
                                currentBlock.AddAtom(atom);
                                // A block tag that is not self-closing becomes the container for
                                // everything that follows, until its closing tag is seen.
                                if (isBlockTag && !chunk.bEndClosure)
                                {
                                    currentBlock = (BlockElement)atom;
                                }
                            }

                            styles.CloseAnySoloTags();
                        }
                    }
                }
            }

            return(root);
        }
コード例 #25
0
 /// <summary>
 /// Creates a wrapper around the supplied low-level parser.
 /// </summary>
 /// <param name="parser">Parser instance stored in <c>Parser</c>.</param>
 public HtmlParser(HTMLparser parser)
 {
     this.Parser = parser;
 }
コード例 #26
0
        /// <summary>
        /// Tests parser by parsing chunk of data and then generating HTML on the basis of parsing
        /// and comparing this to expected HTML: in case of any discrepancies assertion will be fired
        /// </summary>
        /// <param name="bData">Data to parse</param>
        /// <param name="sExpectedHTML">Expected HTML as it gets generated by this very function;
        /// when null the test is skipped</param>
        void TestParser(byte[] bData, string sExpectedHTML)
        {
            if (sExpectedHTML == null)
            {
                return;
            }

            StringBuilder oSB = new StringBuilder(512);

            bool bEncodingSet = false;

            oP.Init(bData);

            // ok lets parse HTML and save the HTML that we view back into string
            HTMLchunk oChunk;

            // we don't want to use hashes as they would change order in which params are made
            oP.SetChunkHashMode(false);

            // we parse until returned oChunk is null indicating we reached end of parsing
            while ((oChunk = oP.ParseNext()) != null)
            {
                switch (oChunk.oType)
                {
                case HTMLchunkType.OpenTag:

                    oSB.AppendFormat("<{0}", oChunk.sTag);

                // Shared tail: the CloseTag and Script cases jump here to emit params and the
                // closing bracket the same way an open tag does.
PrintParams:

                    if (oChunk.sTag.Length == 4 && oChunk.sTag == "meta")
                    {
                        if (!bEncodingSet)
                        {
                            // possible Title re-encoding should happen after this call
                            // once bEncodingSet has been flipped to true
                            HTMLparser.HandleMetaEncoding(oP, oChunk, ref bEncodingSet);
                        }
                    }

                    // commented out call to code that will do the job for you - long code below
                    // is left to demonstrate how to access individual param values
                    // Console.WriteLine(oChunk.GenerateParamsHTML());

                    if (oChunk.bHashMode)
                    {
                        foreach (string sParam in oChunk.oParams.Keys)
                        {
                            string sValue = oChunk.oParams[sParam].ToString();

                            if (sValue.Length > 0)
                            {
                                oSB.AppendFormat(" {0}='{1}'", sParam, oP.ChangeToEntities(sValue));
                            }
                            else
                            {
                                oSB.AppendFormat(" {0}", sParam);
                            }
                        }
                    }
                    else
                    {
                        // this is alternative method of getting params -- it may look less convenient
                        // but it saves a LOT of CPU ticks while parsing. It makes sense when you only need
                        // params for a few
                        for (int i = 0; i < oChunk.iParams; i++)
                        {
                            // here we can use exactly the same single/double quotes as they
                            // were used on params

                            string sValue = oChunk.sValues[i];

                            if (oChunk.bEntities)
                            {
                                sValue = oP.ChangeToEntities(sValue);
                            }

                            switch (oChunk.cParamChars[i])
                            {
                            case (byte)' ':
                                // value was not quoted in the source
                                if (oChunk.sValues[i].Length == 0)
                                {
                                    oSB.AppendFormat(" {0}", oChunk.sParams[i]);
                                }
                                else
                                {
                                    oSB.AppendFormat(" {0}={1}", oChunk.sParams[i], sValue);
                                }
                                break;

                            default:
                                // re-emit the value wrapped in the quote character recorded for it
                                oSB.AppendFormat(" {0}={1}{2}{1}", oChunk.sParams[i], (char)oChunk.cParamChars[i], sValue);
                                break;
                            }
                        }
                    }

                    if (oChunk.bClosure && !oP.bAutoMarkClosedTagsWithParamsAsOpen)
                    {
                        oSB.Append("/>");
                    }
                    else
                    {
                        oSB.Append(">");
                    }
                    break;

                // matched close tag, ie </a>
                case HTMLchunkType.CloseTag:

                    if (oChunk.iParams > 0)
                    {
                        oSB.AppendFormat("<{0}", oChunk.sTag);
                        goto PrintParams;
                    }

                    if (oChunk.bEndClosure)
                    {
                        oSB.AppendFormat("<{0}/>", oChunk.sTag);
                    }
                    else
                    {
                        oSB.AppendFormat("</{0}>", oChunk.sTag);
                    }

                    break;

                // NOTE: you have to call finalisation because it is not done for Scripts or comments
                // Matched data between <script></script> tags
                case HTMLchunkType.Script:

                    if (!oP.bAutoKeepScripts && !oP.bKeepRawHTML)
                    {
                        oP.SetRawHTML(oChunk);
                    }

                    // Append verbatim: script text is data, not a composite format string, so
                    // passing it to AppendFormat would throw or mangle output on '{' / '}'.
                    oSB.Append(oChunk.oHTML);

                    if (oChunk.iParams > 0)
                    {
                        goto PrintParams;
                    }

                    break;

                // NOTE: you have to call finalisation because it is not done for Scripts or comments
                // matched HTML comment, that's stuff between <!-- and -->
                case HTMLchunkType.Comment:
                    if (!oP.bAutoExtractBetweenTagsOnly)
                    {
                        oSB.Append(oChunk.oHTML);
                    }
                    else
                    {
                        oSB.AppendFormat("<!--{0}-->", oChunk.oHTML);
                    }
                    break;

                // matched normal text
                case HTMLchunkType.Text:

                    // skip pure whitespace that we are not really interested in
                    if (oP.bCompressWhiteSpaceBeforeTag && oChunk.oHTML.Trim().Length == 0)
                    {
                        continue;
                    }

                    oSB.Append(oChunk.bEntities ? oP.ChangeToEntities(oChunk.oHTML) : oChunk.oHTML);
                    break;
                }
            }

            // now compare parsed HTML with the one we expect
            Assert.AreEqual(sExpectedHTML, oSB.ToString());
        }
コード例 #27
0
        /// <summary>
        /// Fetches the next chunk from the supplied parser.
        /// </summary>
        /// <param name="parser">Parser to advance.</param>
        /// <returns>Whatever <c>parser.ParseNext()</c> returns; callers in this file treat null as end of input.</returns>
        HTMLchunk ParseNext(HTMLparser parser)
        {
            return parser.ParseNext();
        }
コード例 #28
0
ファイル: HTMLparser.cs プロジェクト: pusp/o2platform
		/// <summary>
		/// Handles META tags that set page encoding
		/// </summary>
		/// <param name="oP">HTML parser object that is used for parsing</param>
		/// <param name="oChunk">Parsed chunk that should contain tag META</param>
		/// <param name="bEncodingSet">Your own flag that shows whether encoding was already set or not, if set
		/// once then it should not be changed - this is the logic applied by major browsers. It is switched to
		/// true here when the parser accepts the encoding found in the tag</param>
		/// <returns>True if this was META tag setting Encoding, false otherwise</returns>
		public static bool HandleMetaEncoding(HTMLparser oP,HTMLchunk oChunk,ref bool bEncodingSet)
		{
			// cheap length/first-char checks run first so that most non-META tags are rejected
			// before the full string comparison
			if(oChunk.sTag.Length!=4 || oChunk.sTag[0]!='m' || oChunk.sTag!="meta")
				return false;

			// if we do not use hashmode already then we call conversion explicitly
			// this is slow, but METAs are very rare so performance penalty is low
			if(!oChunk.bHashMode)
				oChunk.ConvertParamsToHash();

			string sKey=oChunk.oParams["http-equiv"] as string;

			if(sKey==null)
				return false;

			// Ordinal case-insensitive comparison: no lower-cased copy is allocated and the
			// result does not depend on the current culture.
			// "content-category" is a rare case (appears to work in IE) reported to exist in
			// some pages by Martin Bächtold
			bool bEncodingMeta=
				string.Equals(sKey,"content-type",System.StringComparison.OrdinalIgnoreCase) ||
				string.Equals(sKey,"content-category",System.StringComparison.OrdinalIgnoreCase);

			if(!bEncodingMeta)
				return false;

			// once encoding is set it should not be changed, but you can be damn
			// sure there are web pages out there that do that!!!
			if(!bEncodingSet)
			{
				// it is possible we have broken META tag without Content part
				string sData=oChunk.oParams["content"] as string;

				// SetEncoding returning false means the encoding could not be applied - most likely
				// the encoding string was incorrect or the machine lacks the codepage; the flag
				// then stays false (might be good idea to put warning message here)
				if(sData!=null && oP.SetEncoding(sData))
				{
					// here you need to reencode any text that you found so far
					// most likely it will be just TITLE, the rest can be ignored anyway
					bEncodingSet=true;
				}
			}

			return true;
		}
コード例 #29
0
ファイル: HtmlParser.cs プロジェクト: prepare/majestic13
        /// <summary>
        /// Parses markup text chunk by chunk and feeds every chunk into an
        /// <c>HtmlBuilder</c>, returning what the builder renders.
        /// </summary>
        /// <param name="html">Markup text; matches of <c>dedoctype</c> are removed before parsing.</param>
        /// <returns>The result of <c>HtmlBuilder.Render()</c>.</returns>
        public HtmlNode Parse(string html)
        {
            // Majestic12 doesn't support doctype
            string cleaned = dedoctype.Replace(html, "");
            var    builder = new HtmlBuilder();

            var parser = new HTMLparser();

            parser.bDecodeEntities = false;
            parser.SetChunkHashMode(true);
            parser.Init(cleaned);

            for (var chunk = parser.ParseNext(); chunk != null; chunk = parser.ParseNext())
            {
                HTMLchunkType type = chunk.oType;

                if (type == HTMLchunkType.OpenTag)
                {
                    // if something goes wrong - ignore it
                    if (chunk.sTag != "")
                    {
                        builder.OpenTag(chunk.sTag, CollectAttributes(chunk));
                    }
                }
                else if (type == HTMLchunkType.Comment)
                {
                    builder.AddComment(chunk.oHTML);
                }
                else if (type == HTMLchunkType.CloseTag)
                {
                    // a self-closed tag is reported as a close tag, so open it first
                    if (chunk.bEndClosure)
                    {
                        builder.OpenTag(chunk.sTag, CollectAttributes(chunk));
                    }
                    builder.CloseTag(chunk.sTag);
                }
                else if (type == HTMLchunkType.Script)
                {
                    builder.AddScript(chunk.oHTML);
                }
                else if (type == HTMLchunkType.Text)
                {
                    builder.AddText(chunk.oHTML);
                }
                // any other chunk type is ignored
            }

            return builder.Render();
        }

        /// <summary>
        /// Copies the tag parameters of a chunk into a fresh name/value dictionary.
        /// </summary>
        /// <param name="chunk">Chunk whose <c>oParams</c> table is read when <c>iParams</c> is non-zero.</param>
        /// <returns>A new dictionary; empty when the chunk reports no parameters.</returns>
        private static Dictionary <string, string> CollectAttributes(HTMLchunk chunk)
        {
            var result = new Dictionary <string, string>();

            if (chunk.iParams == 0)
            {
                return result;
            }

            foreach (string key in chunk.oParams.Keys)
            {
                result.Add(key, (string)chunk.oParams[key]);
            }

            return result;
        }
コード例 #30
0
        /// <summary>
        /// Handles META tags that set page encoding
        /// </summary>
        /// <param name="oP">HTML parser object that is used for parsing</param>
        /// <param name="oChunk">Parsed chunk that should contain tag META</param>
        /// <param name="bEncodingSet">Your own flag that shows whether encoding was already set or not, if set
        /// once then it should not be changed - this is the logic applied by major browsers. It is switched to
        /// true here when the parser accepts the encoding found in the tag</param>
        /// <returns>True if this was META tag setting Encoding, false otherwise</returns>
        public static bool HandleMetaEncoding(HTMLparser oP, HTMLchunk oChunk, ref bool bEncodingSet)
        {
            // cheap length/first-char checks run first so that most non-META tags are rejected
            // before the full string comparison
            if (oChunk.sTag.Length != 4 || oChunk.sTag[0] != 'm' || oChunk.sTag != "meta")
            {
                return(false);
            }

            // if we do not use hashmode already then we call conversion explicitly
            // this is slow, but METAs are very rare so performance penalty is low
            if (!oChunk.bHashMode)
            {
                oChunk.ConvertParamsToHash();
            }

            string sKey = oChunk.oParams["http-equiv"] as string;

            if (sKey == null)
            {
                return(false);
            }

            // Ordinal case-insensitive comparison: no lower-cased copy is allocated and the
            // result does not depend on the current culture.
            // "content-category" is a rare case (appears to work in IE) reported to exist in
            // some pages by Martin Bächtold
            bool bEncodingMeta =
                string.Equals(sKey, "content-type", System.StringComparison.OrdinalIgnoreCase) ||
                string.Equals(sKey, "content-category", System.StringComparison.OrdinalIgnoreCase);

            if (!bEncodingMeta)
            {
                return(false);
            }

            // once encoding is set it should not be changed, but you can be damn
            // sure there are web pages out there that do that!!!
            if (!bEncodingSet)
            {
                // it is possible we have broken META tag without Content part
                string sData = oChunk.oParams["content"] as string;

                // SetEncoding returning false means the encoding could not be applied - most likely
                // the encoding string was incorrect or the machine lacks the codepage; the flag
                // then stays false (might be good idea to put warning message here)
                if (sData != null && oP.SetEncoding(sData))
                {
                    // here you need to reencode any text that you found so far
                    // most likely it will be just TITLE, the rest can be ignored anyway
                    bEncodingSet = true;
                }
            }

            return(true);
        }