/// <summary>
/// Consumes the chunks that follow a &lt;dt&gt; tag and builds the bookmark entry they describe.
/// </summary>
/// <param name="parser">Parser to read from; the chunk it is currently on is remembered for stepping back.</param>
/// <returns>
/// A <see cref="BookmarkFolder"/> as soon as an opening &lt;h3&gt; is met; otherwise the
/// <see cref="BookmarkLink"/> created from an opening &lt;a&gt; (or <c>null</c> when none was seen)
/// once the next &lt;dt&gt;, any &lt;dl&gt; tag, or the end of input is reached.
/// </returns>
private IBookmarkItem ParseItem(HTMLparser parser)
{
    BookmarkLink link = null;
    HTMLchunk previous = parser.CurrentChunk;

    for (HTMLchunk current = parser.ParseNext(); current != null; current = parser.ParseNext())
    {
        bool opens = current.IsOpenTag;
        string tag = current.Tag;

        // A new <dt>, or any <dl> tag (opening or closing), belongs to the caller:
        // hand the parser back the previously seen chunk and stop.
        if (tag == "dl" || (opens && tag == "dt"))
        {
            parser.StepBack(previous);
            return link;
        }

        if (opens)
        {
            switch (tag)
            {
                case "a":
                    link = new BookmarkLink();
                    AssignLinkAttributes(link, current.oParams);
                    link.Title = GetTextOrDontMove(parser);
                    break;

                case "dd":
                    // A description only makes sense once a link has been started.
                    if (link != null)
                    {
                        link.Description = ParseDescription(parser);
                    }
                    break;

                case "h3":
                {
                    // An <h3> denotes a folder header; it wins over any link collected so far.
                    var folder = new BookmarkFolder();
                    AssignFolderAttributes(folder, current.oParams);
                    folder.Title = GetTextOrDontMove(parser);
                    return folder;
                }
            }
        }

        previous = current;
    }

    return link;
}
/// <summary>
/// Creates a new <see cref="HTMLparser"/> and applies a fixed set of parsing options to it.
/// </summary>
/// <returns>The newly created, configured parser.</returns>
internal static HTMLparser GetInstance()
{
    var parser = new HTMLparser();

    // Optional, performance-oriented setting: with chunk hash mode off, tag params are
    // stored in the HTMLchunk string arrays sParams/sValues (count in iParams).
    // When it is on (the default), params go into the HTMLchunk.oParams hashtable instead.
    parser.SetChunkHashMode(false);

    // Keep the original parsed HTML for each chunk. This costs some performance but is
    // useful where reconstruction of the HTML may be necessary.
    parser.bKeepRawHTML = true;

    // Decode entities (off by default). Essential when strings should hold the final
    // representation of the data; such strings must be entity-encoded again before being
    // written back into HTML output.
    parser.bDecodeEntities = true;

    // "Mini Entities" mode keeps most entities as-is and replaces only a small set of them
    // (the example in the original comment appears to have been lost). It is handy when the
    // HTML will be re-created after parsing.
    parser.bDecodeMiniEntities = true;

    // Mini entities are initialised only when full decoding is off; with the values set
    // just above this branch is not taken.
    if (!parser.bDecodeEntities && parser.bDecodeMiniEntities)
    {
        parser.InitMiniEntities();
    }

    // For comments and SCRIPT tags, expose only the data BETWEEN the tags in oHTML rather
    // than the complete raw HTML including the tags. Only effective with auto extraction.
    parser.bAutoExtractBetweenTagsOnly = true;

    // Extract comments automatically.
    parser.bAutoKeepComments = true;

    // Extract scripts automatically.
    parser.bAutoKeepScripts = true;

    // Compress whitespace before the start of a tag to a single space. Setting this to
    // false returns the full whitespace (slower) and is only worthwhile when exact
    // whitespace between tags matters.
    parser.bCompressWhiteSpaceBeforeTag = true;

    // When true (the default), tags with attributes that are marked as closed ("/" at the
    // end) are forced to be treated as open tags. Disabled here.
    parser.bAutoMarkClosedTagsWithParamsAsOpen = false;

    return parser;
}
/// <summary>
/// Creates a lookup over the given HTML document.
/// </summary>
/// <param name="doc">HTML text to parse. When null or empty, no parser is created.</param>
public HtmlLookup(string doc)
{
    if (string.IsNullOrEmpty(doc))
    {
        return;
    }

    _parser = new HTMLparser();
    _parser.Init(doc);
}
/// <summary>
/// Closes and disposes the underlying parser, if one was created, and clears the reference
/// so that repeated calls do nothing.
/// </summary>
public void Dispose()
{
    if (_parser == null)
    {
        return;
    }

    _parser.Close();
    _parser.Dispose();
    _parser = null;
}
/// <summary>
/// Reads the next chunk and returns it as a description when it is non-blank text.
/// </summary>
/// <param name="parser">Parser to read the next chunk from.</param>
/// <returns>The trimmed text, or <c>null</c> when the next chunk is missing, not text, or only whitespace.</returns>
private string ParseDescription(HTMLparser parser)
{
    HTMLchunk next = parser.ParseNext();
    if (next == null || !next.IsText)
    {
        return null;
    }

    return string.IsNullOrWhiteSpace(next.HTML) ? null : next.HTML.Trim();
}
/// <summary>
/// Parses raw bookmark-file bytes into a folder tree.
/// </summary>
/// <param name="content">Bytes of the HTML document to parse.</param>
/// <returns>The root folder produced by <c>ParseFolder</c>.</returns>
private BookmarkFolder Parse(byte[] content)
{
    // Entity decoding is switched on so titles and descriptions come back as plain text.
    var parser = new HTMLparser(content)
    {
        DecodeEntities = true
    };

    return ParseFolder(parser, null, true);
}
/// <summary>
/// Returns the HTML of the next chunk when it is a text chunk; otherwise steps the parser
/// back onto that chunk so the caller can process it.
/// </summary>
/// <param name="parser">Parser to read the next chunk from.</param>
/// <returns>
/// The text of the next chunk, or <c>null</c> when the next chunk is not text or the end of
/// the input has been reached.
/// </returns>
private string GetTextOrDontMove(HTMLparser parser)
{
    var textChunk = parser.ParseNext();

    // ParseNext returns null at the end of input; there is nothing to read or step back to.
    if (textChunk == null)
    {
        return null;
    }

    if (textChunk.IsText)
    {
        return textChunk.HTML;
    }

    parser.StepBack(textChunk);
    return null;
}
/// <summary>
/// Checks font size parsing (relative and invalid values) and the font size comparison helpers.
/// </summary>
public void FontSizes()
{
    var small = HTMLparser.FontSize.Small;
    var medium = HTMLparser.FontSize.Medium;
    var large = HTMLparser.FontSize.Large;
    var unknown = HTMLparser.FontSize.Unknown;

    // Relative sizes are resolved against the current size.
    Assert.AreEqual(large, HTMLparser.ParseFontSize("+1", medium));
    Assert.AreEqual(small, HTMLparser.ParseFontSize("-1", medium));

    // Empty or unparseable values yield Unknown.
    Assert.AreEqual(unknown, HTMLparser.ParseFontSize("", medium));
    Assert.AreEqual(unknown, HTMLparser.ParseFontSize("ald", medium));

    // Comparison helpers.
    Assert.IsFalse(HTMLparser.IsBiggerFont(small, large));
    Assert.IsTrue(HTMLparser.IsBiggerFont(medium, small));
    Assert.IsTrue(HTMLparser.IsEqualOrBiggerFont(medium, medium));
    Assert.IsTrue(HTMLparser.IsEqualOrBiggerFont(medium, small));
}
/// <summary>
/// (Re)creates the shared parser instance with entity decoding enabled, closing any
/// previously created parser first.
/// </summary>
public void CreateParser()
{
    // Release the previous instance, if any, before replacing it.
    if (oP != null)
    {
        oP.Close();
        oP = null;
    }

    oP = new HTMLparser();
    oP.bDecodeEntities = true;

    // Dummy assertion so the method asserts something.
    Assert.IsNotNull(oP);
}
/// <summary>
/// Checks width calculation for absolute, empty, oversized, and percentage values.
/// </summary>
public void Widths()
{
    bool isRelative = false;

    // Plain number: taken as-is and not relative.
    var absolute = HTMLparser.CalculateWidth("500", 1000, ref isRelative);
    Assert.AreEqual(500, absolute);
    Assert.IsFalse(isRelative);

    // Empty value: the available width is expected back.
    var fromEmpty = HTMLparser.CalculateWidth("", 1000, ref isRelative);
    Assert.AreEqual(1000, fromEmpty);

    // Oversized number: the available width is expected back.
    var fromHuge = HTMLparser.CalculateWidth("10000000000000", 1000, ref isRelative);
    Assert.AreEqual(1000, fromHuge);

    // Percentage: resolved against the available width and flagged as relative.
    var fromPercent = HTMLparser.CalculateWidth("50%", 1000, ref isRelative);
    Assert.AreEqual(500, fromPercent);
    Assert.IsTrue(isRelative);
}
/// <summary>
/// Parses a dictionary result page by feeding each HTML chunk to the open-tag, close-tag,
/// and text handlers, which fill <c>dictResult</c>.
/// </summary>
/// <param name="html">HTML text of the page.</param>
/// <param name="encoding">Encoding used to turn <paramref name="html"/> into bytes and set on the parser.</param>
/// <returns>The populated dictionary result.</returns>
public SmartMe.Core.Data.DictResult Parse(string html, Encoding encoding)
{
    dictResult = new DictResult();
    HTMLparser parser = HtmlParserFactory.GetInstance();
    dictResult.DictionaryType = DictionaryType.Dict_cn;

    parser.Init(encoding.GetBytes(html));
    parser.SetEncoding(encoding);

    int state = 0;
    bool encodingSet = false;

    for (HTMLchunk chunk = parser.ParseNext(); chunk != null; chunk = parser.ParseNext())
    {
        HTMLchunkType type = chunk.oType;

        if (type == HTMLchunkType.OpenTag)
        {
            HandleOpenTag(chunk, ref state);

            // A META tag may carry the page encoding.
            if (chunk.sTag == "meta")
            {
                HandleMetaEncoding(parser, chunk, ref encodingSet);
            }

            HandleParam(chunk, ref state);
        }
        else if (type == HTMLchunkType.CloseTag)
        {
            HandleCloseTag(chunk, ref state);
        }
        else if (type == HTMLchunkType.Text)
        {
            HandleText(chunk, ref state);
        }
        // Other chunk types are ignored.
    }

    return dictResult;
}
/// <summary>
/// Parses a Baidu result page by feeding each HTML chunk to the open-tag, close-tag, and
/// text handlers, which fill <c>searchResult</c>.
/// </summary>
/// <param name="html">HTML text of the page.</param>
/// <param name="encoding">Encoding used to turn <paramref name="html"/> into bytes and set on the parser.</param>
/// <returns>The populated search result.</returns>
public SearchEngineResult Parse(string html, Encoding encoding)
{
    HTMLparser parser = HtmlParserFactory.GetInstance();

    searchResult = new SearchEngineResult();
    searchResult.SearchEngineType = SearchEngineType.Baidu;
    item = new SearchEngineResult.ResultItem();

    parser.Init(encoding.GetBytes(html));
    parser.SetEncoding(encoding);

    int state = 0;
    bool encodingSet = false;

    for (HTMLchunk chunk = parser.ParseNext(); chunk != null; chunk = parser.ParseNext())
    {
        HTMLchunkType type = chunk.oType;

        if (type == HTMLchunkType.OpenTag)
        {
            HandleOpenTag(chunk, ref state);

            // A META tag may carry the page encoding.
            if (chunk.sTag == "meta")
            {
                HandleMetaEncoding(parser, chunk, ref encodingSet);
            }

            HandleParam(chunk, ref state);
        }
        else if (type == HTMLchunkType.CloseTag)
        {
            HandleCloseTag(chunk, ref state);
        }
        else if (type == HTMLchunkType.Text)
        {
            HandleText(chunk, ref state);
        }
        // Other chunk types are ignored.
    }

    return searchResult;
}
/// <summary>
/// Parses the contents of a &lt;dl&gt; list into a folder, recursing into nested lists.
/// </summary>
/// <param name="parser">Parser to read chunks from.</param>
/// <param name="folderBase">Folder to fill (for example one created from a preceding header), or <c>null</c> to start a new one.</param>
/// <param name="root">
/// When <c>true</c>, the first nested &lt;dl&gt; replaces the folder being built instead of
/// being added to it as a child.
/// </param>
/// <returns>The folder built up to the matching closing &lt;dl&gt; or the end of input.</returns>
private BookmarkFolder ParseFolder(HTMLparser parser, BookmarkFolder folderBase, bool root = false)
{
    BookmarkFolder current = folderBase ?? new BookmarkFolder();

    // From here on folderBase holds the most recent folder header seen in this list.
    // NOTE(review): it is not cleared after a nested list consumes it - confirm this is intended.
    folderBase = null;
    AssignFolderAttributes(current, current.Attributes);

    for (HTMLchunk chunk = parser.ParseNext(); chunk != null; chunk = parser.ParseNext())
    {
        if (chunk.Type == HTMLchunkType.OpenTag && chunk.Tag == "dt")
        {
            var item = ParseItem(parser);
            BookmarkFolder header = item as BookmarkFolder;

            if (header != null)
            {
                // A folder header: remember it for the <dl> that follows.
                folderBase = header;
            }
            else if (item != null)
            {
                current.Add(item);
            }
        }
        else if (chunk.IsOpenTag && chunk.Tag == "dl")
        {
            BookmarkFolder nested = ParseFolder(parser, folderBase);

            if (root)
            {
                current = nested;
                root = false;
            }
            else
            {
                current.Add(nested);
            }
        }
        else if (chunk.IsCloseTag && chunk.Tag == "dl")
        {
            return current;
        }
    }

    return current;
}
/// <summary>
/// Applies the encoding declared by a META tag unless an encoding has already been set,
/// and reports to the console when such a tag was found but the encoding could not be set.
/// </summary>
/// <param name="oP">Parser whose encoding may be changed.</param>
/// <param name="oChunk">Chunk expected to be a META tag.</param>
/// <param name="bEncodingSet">Tracks whether an encoding has been set; updated on success.</param>
private void HandleMetaEncoding(HTMLparser oP, HTMLchunk oChunk, ref bool bEncodingSet)
{
    // The first encoding wins - this is the logic major browsers follow - so once one is
    // set there is nothing more to do.
    if (bEncodingSet)
    {
        return;
    }

    bool wasEncodingMeta = HTMLparser.HandleMetaEncoding(oP, oChunk, ref bEncodingSet);
    if (wasEncodingMeta && !bEncodingSet)
    {
        Console.WriteLine("Failed to set encoding from META: {0}", oChunk.GenerateHTML());
    }
}
/// <summary>
/// Extracts one <see cref="Protein"/> per table row whose first cell is an integer project number.
/// </summary>
/// <param name="html">HTML text containing the table.</param>
/// <returns>The proteins read from the rows, in document order.</returns>
private static List<Protein> ParseProteins(string html)
{
    var parser = new HTMLparser();
    parser.Init(html);

    var proteins = new List<Protein>();

    for (HTMLchunk chunk = parser.ParseNext(); chunk != null; chunk = parser.ParseNext())
    {
        // Only opening "tr" tags start a row.
        bool isRowStart = chunk.oType.Equals(HTMLchunkType.OpenTag)
                          && string.Equals(chunk.sTag, "tr", StringComparison.OrdinalIgnoreCase);
        if (!isRowStart)
        {
            continue;
        }

        // Rows whose first cell is not a project number (e.g. header rows) are skipped.
        int projectNumber;
        if (!Int32.TryParse(GetNextTdValue(parser), NumberStyles.Integer, CultureInfo.InvariantCulture, out projectNumber))
        {
            continue;
        }

        // Cells are consumed strictly in column order.
        var protein = new Protein
        {
            ProjectNumber = projectNumber,
            ServerIP = GetNextTdValue(parser),
            WorkUnitName = GetNextTdValue(parser),
            NumberOfAtoms = ToInt32OrDefault(GetNextTdValue(parser)),
            PreferredDays = ToDoubleOrDefault(GetNextTdValue(parser)),
            MaximumDays = ToDoubleOrDefault(GetNextTdValue(parser)),
            Credit = ToDoubleOrDefault(GetNextTdValue(parser)),
            Frames = ToInt32OrDefault(GetNextTdValue(parser)),
            Core = GetNextTdValue(parser),
            Description = GetNextTdValue(parser, "href"),
            Contact = GetNextTdValue(parser),
            KFactor = ToDoubleOrDefault(GetNextTdValue(parser))
        };

        proteins.Add(protein);
    }

    return proteins;
}
/// <summary>
/// Creates the parser for a session's response body and sets its encoding from the charset
/// reported for the session, falling back to UTF-8 when that charset name is not valid.
/// </summary>
/// <param name="session">Session whose response is to be parsed.</param>
public void Open(Session session)
{
    Parser = new HTMLparser();

    try
    {
        // Only HTML and XML responses are loaded into the parser.
        if (Utility.IsResponseHtml(session) || Utility.IsResponseXml(session))
        {
            Parser.Init(session.responseBodyBytes ?? new byte[] { });
            Parser.bAutoKeepScripts = true;
            Parser.bEnableHeuristics = false;

            // When bAutoExtractBetweenTagsOnly is false, the parser will see attributes
            // in the script tags, such as <script src="mydata">. Otherwise it will not.
            Parser.bAutoExtractBetweenTagsOnly = true;
        }
    }
    catch (Exception e)
    {
        Trace.TraceWarning("Warning: UtilityHtmlParser threw an unhandled exception: {0}", e.Message);
        ExceptionLogger.HandleException(e);
    }

    // Get the encoding name from the HTML or HTTP.
    String charset = Utility.GetHtmlCharset(session);

    try
    {
        // TODO: check if the encoding is a known good before continuing!!!
        // GetEncoding is called only for validation: it throws an ArgumentException when
        // the charset name is not a valid system encoding name.
        // NOTE(review): the validated Encoding is discarded and the name is passed on - confirm intended.
        Encoding.GetEncoding(charset);
        Parser.SetEncoding(charset);
    }
    catch (ArgumentException)
    {
        // Default to UTF-8 if the charset name is not usable.
        Parser.SetEncoding(new UTF8Encoding(false, false));
    }
}
/// <summary>
/// Advances the parser to the next opening tag named <paramref name="tagName"/> and returns
/// either the text inside it or a parameter of the first tag inside it.
/// </summary>
/// <param name="htmlParser">Parser to advance.</param>
/// <param name="tagName">Lower-case name of the tag to look for, e.g. "td".</param>
/// <param name="paramName">
/// Name of the parameter to read from the first open tag inside the matched tag. When null
/// or empty, the text inside the matched tag (optionally wrapped in a "font" tag) is returned.
/// </param>
/// <returns>
/// The trimmed text or the parameter value, or <see cref="String.Empty"/> when the matched
/// tag does not contain it or no matching tag is found before the end of input.
/// </returns>
public static string GetNextValue(HTMLparser htmlParser, string tagName, string paramName)
{
    HTMLchunk oChunk;
    while ((oChunk = htmlParser.ParseNext()) != null)
    {
        // Look for an open tag matching the given tag name. The invariant culture is used
        // so the comparison does not depend on the current culture (e.g. Turkish 'I').
        if (!oChunk.oType.Equals(HTMLchunkType.OpenTag) || oChunk.sTag.ToLowerInvariant() != tagName)
        {
            continue;
        }

        // The first matching tag decides the result; look at the chunk inside it.
        oChunk = htmlParser.ParseNext();
        if (oChunk == null)
        {
            return String.Empty;
        }

        if (string.IsNullOrEmpty(paramName))
        {
            // Not looking for a tag parameter: step through an optional "font" wrapper.
            if (oChunk.oType.Equals(HTMLchunkType.OpenTag) && oChunk.sTag.ToLowerInvariant() == "font")
            {
                oChunk = htmlParser.ParseNext();
            }

            // If it's text, return it.
            if (oChunk != null && oChunk.oType.Equals(HTMLchunkType.Text))
            {
                return oChunk.oHTML.Trim();
            }

            return String.Empty;
        }

        // Looking for a tag parameter on the open tag inside the matched tag.
        if (oChunk.oType.Equals(HTMLchunkType.OpenTag) && oChunk.oParams.Contains(paramName))
        {
            // A parameter can be present with a null value; treat that as empty.
            object value = oChunk.oParams[paramName];
            return value != null ? value.ToString() : String.Empty;
        }

        return String.Empty;
    }

    return String.Empty;
}
/// <summary>
/// Returns the text inside the next "th" tag found by the parser.
/// </summary>
/// <param name="htmlParser">Parser to advance.</param>
/// <returns>The value produced by <c>GetNextValue</c> for tag "th" with no parameter name.</returns>
public static string GetNextThValue(HTMLparser htmlParser)
{
    return GetNextValue(htmlParser, tagName: "th", paramName: String.Empty);
}
/// <summary>
/// Returns a value from the next "td" tag found by the parser.
/// </summary>
/// <param name="htmlParser">Parser to advance.</param>
/// <param name="paramName">Parameter name forwarded to <c>GetNextValue</c>; empty to read the cell text.</param>
/// <returns>The value produced by <c>GetNextValue</c> for tag "td".</returns>
public static string GetNextTdValue(HTMLparser htmlParser, string paramName)
{
    return GetNextValue(htmlParser, tagName: "td", paramName: paramName);
}
/// <summary>
/// Returns the text inside the next "td" tag found by the parser.
/// </summary>
/// <param name="pSummary">Parser to advance.</param>
/// <returns>The value produced by the two-argument overload with an empty parameter name.</returns>
public static string GetNextTdValue(HTMLparser pSummary)
{
    return GetNextTdValue(pSummary, paramName: String.Empty);
}
/// <summary>
/// Downloads the listing page built from <c>_noaaSstGetUrl</c>, looks for the first anchor
/// whose href starts with "ftp://", and downloads that file to "SST.nc" in the work folder
/// unless the file already exists.
/// </summary>
/// <returns>
/// <c>true</c> when an ftp link was found and the file exists locally afterwards;
/// <c>false</c> when no link was found or any step failed.
/// </returns>
private bool RequestSstFile()
{
    try
    {
        DateTime dt = DateTime.Today.AddDays(-10);
        string getUrl = ReplaceMacros(_noaaSstGetUrl, dt);

        // Remove before adding so that repeated calls do not stack the same handler on
        // the process-wide callback.
        ServicePointManager.ServerCertificateValidationCallback -= ValidateRemoteCertificate;
        ServicePointManager.ServerCertificateValidationCallback += ValidateRemoteCertificate;
        ServicePointManager.SecurityProtocol = (SecurityProtocolType)3072; // TLS 1.2

        string response;
        using (WebClient wc = new WebClient())
        {
            response = wc.DownloadString(getUrl);
        }

        HTMLparser parser = new HTMLparser();
        parser.Init(response);

        HTMLchunk chunk;
        while ((chunk = parser.ParseNext()) != null)
        {
            // Only opening <a> tags that carry parameters are of interest.
            if (chunk.sTag != "a")
            {
                continue;
            }

            if (chunk.oType != HTMLchunkType.OpenTag)
            {
                continue;
            }

            if (chunk.oParams == null || chunk.oParams.Count < 1)
            {
                continue;
            }

            string href = chunk.oParams["href"]?.ToString();
            if (string.IsNullOrEmpty(href) || !href.StartsWith("ftp://", StringComparison.Ordinal))
            {
                continue;
            }

            string file = Path.Combine(SimulationData.WorkFolder, "SST.nc");
            if (!File.Exists(file))
            {
                using (WebClient wc = new WebClient())
                {
                    wc.DownloadFile(href, file);
                }
            }

            // Once the file exists, later links would be skipped anyway, so stop here and
            // report success.
            return true;
        }
    }
    catch (Exception ex)
    {
        // Best effort: a failed request is reported through the return value, but the
        // reason is traced rather than silently dropped.
        System.Diagnostics.Trace.TraceWarning("RequestSstFile failed: {0}", ex.Message);
    }

    return false;
}
/// <summary>
/// Parses HTML bytes with the Majestic-12 parser and rebuilds the chunks as an XML node tree
/// under a synthetic "document" element.
/// </summary>
/// <param name="htmlAsBytes">Raw HTML to convert.</param>
/// <returns>The child nodes of the root of the constructed tree.</returns>
/// <exception cref="Exception">
/// Wraps any error raised while handling a chunk; the offending chunk's source text is
/// stored in the exception's <c>Data["source"]</c>.
/// </exception>
public static IEnumerable<XNode> ConvertNodesToXml(byte[] htmlAsBytes)
{
    HTMLparser parser = OpenParser();
    parser.Init(htmlAsBytes);

    XElement currentNode = new XElement("document");
    HTMLchunk m12chunk = null;
    int xmlnsAttributeIndex = 0;

    // Only filled in on failure (see the catch block); the helpers receive it for tracing.
    string originalHtml = "";

    while ((m12chunk = parser.ParseNext()) != null)
    {
        try
        {
            Debug.Assert(!m12chunk.bHashMode); // popular default for Majestic-12 setting

            XNode newNode = null;
            XElement newNodesParent = null;

            switch (m12chunk.oType)
            {
                case HTMLchunkType.OpenTag:
                    // Tags are added as a child to the current tag,
                    // except when the new tag implies the closure of
                    // some number of ancestor tags.
                    newNode = ParseTagNode(m12chunk, originalHtml, ref xmlnsAttributeIndex);
                    if (newNode != null)
                    {
                        currentNode = FindParentOfNewNode(m12chunk, originalHtml, currentNode);
                        newNodesParent = currentNode;
                        newNodesParent.Add(newNode);
                        // NOTE(review): assumes ParseTagNode returns an XElement for open tags;
                        // otherwise currentNode becomes null here - confirm.
                        currentNode = newNode as XElement;
                    }
                    break;

                case HTMLchunkType.CloseTag:
                    if (m12chunk.bEndClosure)
                    {
                        // Self-closed tag: add it but do not descend into it.
                        newNode = ParseTagNode(m12chunk, originalHtml, ref xmlnsAttributeIndex);
                        if (newNode != null)
                        {
                            currentNode = FindParentOfNewNode(m12chunk, originalHtml, currentNode);
                            newNodesParent = currentNode;
                            newNodesParent.Add(newNode);
                        }
                    }
                    else
                    {
                        // Walk up to the nearest open element with the same name and close it.
                        XElement nodeToClose = currentNode;
                        string m12chunkCleanedTag = CleanupTagName(m12chunk.sTag, originalHtml);
                        while (nodeToClose != null && nodeToClose.Name.LocalName != m12chunkCleanedTag)
                        {
                            nodeToClose = nodeToClose.Parent;
                        }

                        // Never close the synthetic root: its Parent is null, and a null
                        // currentNode would make every following chunk (and the final walk
                        // to the root) fail with a NullReferenceException.
                        if (nodeToClose != null && nodeToClose.Parent != null)
                        {
                            currentNode = nodeToClose.Parent;
                        }

                        Debug.Assert(currentNode != null);
                    }
                    break;

                case HTMLchunkType.Script:
                    // Script content is not carried over.
                    newNode = new XElement("script", "REMOVED");
                    newNodesParent = currentNode;
                    newNodesParent.Add(newNode);
                    break;

                case HTMLchunkType.Comment:
                    newNodesParent = currentNode;
                    if (m12chunk.sTag == "!--")
                    {
                        newNode = new XComment(m12chunk.oHTML);
                    }
                    else if (m12chunk.sTag == "![CDATA[")
                    {
                        newNode = new XCData(m12chunk.oHTML);
                    }
                    else
                    {
                        throw new Exception("Unrecognized comment sTag");
                    }

                    newNodesParent.Add(newNode);
                    break;

                case HTMLchunkType.Text:
                    currentNode.Add(m12chunk.oHTML);
                    break;

                default:
                    break;
            }
        }
        catch (Exception e)
        {
            var wrappedE = new Exception("Error using Majestic12.HTMLChunk, reason: " + e.Message, e);

            // the original html is copied for tracing/debugging purposes
            originalHtml = new string(htmlAsBytes.Skip(m12chunk.iChunkOffset)
                                                 .Take(m12chunk.iChunkLength)
                                                 .Select(B => (char)B).ToArray());
            wrappedE.Data.Add("source", originalHtml);
            throw wrappedE;
        }
    }

    // Climb back to the root so that all top-level nodes are returned.
    while (currentNode.Parent != null)
    {
        currentNode = currentNode.Parent;
    }

    return currentNode.Nodes();
}
/// <summary>
/// Download project information from Stanford University (psummaryC.html), parse the
/// project table into this collection, and save the result to the local CSV file.
/// </summary>
/// <param name="State">Null in this implementation</param>
public void DownloadFromStanford(Object State /* null */)
{
    DateTime Start = Debug.ExecStart;

    lock (this)
    {
        Preferences.PreferenceSet Prefs = Preferences.PreferenceSet.Instance;

        WebRequest wrq = (WebRequest)WebRequest.Create("http://vspx27.stanford.edu/psummaryC.html");
        wrq.Method = WebRequestMethods.Http.Get;

        if (Prefs.UseProxy)
        {
            wrq.Proxy = new WebProxy(Prefs.ProxyServer, Prefs.ProxyPort);
            if (Prefs.UseProxyAuth)
            {
                wrq.Proxy.Credentials = new NetworkCredential(Prefs.ProxyUser, Prefs.ProxyPass);
            }
        }
        else
        {
            wrq.Proxy = null;
        }

        // TODO: Handle timeouts and errors
        String sSummaryPage = null;
        try
        {
            // The response and reader are disposed as soon as the page has been read, and
            // the read itself is covered by the handlers below.
            using (WebResponse wrs = (WebResponse)wrq.GetResponse())
            {
                if (wrs == null)
                {
                    throw new IOException("The web response or stream was null");
                }

                using (StreamReader sr1 = new StreamReader(wrs.GetResponseStream(), Encoding.ASCII))
                {
                    sSummaryPage = sr1.ReadToEnd();
                }
            }
        }
        catch (WebException ExWeb)
        {
            ClassLogger.LogException(LogLevel.Warn, String.Format("{0} threw WebException {1}.", Debug.FunctionName, ExWeb.Message), null);
            ClassLogger.Log(LogLevel.Trace, String.Format("{0} Execution Time: {1}", Debug.FunctionName, Debug.GetExecTime(Start)), "");
            return;
        }
        catch (IOException ExIO)
        {
            ClassLogger.LogException(LogLevel.Warn, String.Format("{0} threw IOException {1}.", Debug.FunctionName, ExIO.Message), null);
            ClassLogger.Log(LogLevel.Trace, String.Format("{0} Execution Time: {1}", Debug.FunctionName, Debug.GetExecTime(Start)), "");
            return;
        }
        catch (Exception Ex)
        {
            // Any other failure; the message no longer mislabels it as a WebException.
            ClassLogger.LogException(LogLevel.Warn, String.Format("{0} threw Exception {1}.", Debug.FunctionName, Ex.Message), null);
            ClassLogger.Log(LogLevel.Trace, String.Format("{0} Execution Time: {1}", Debug.FunctionName, Debug.GetExecTime(Start)), "");
            return;
        }

        HTMLparser pSummary = new HTMLparser();
        pSummary.Init(sSummaryPage);

        // Locate the table
        HTMLchunk oChunk = null;

        // Parse until returned oChunk is null indicating we reached end of parsing
        while ((oChunk = pSummary.ParseNext()) != null)
        {
            if (oChunk.sTag.ToLower() == "tr")
            {
                Protein p = new Protein();

                // Advance to the first "td" of the row.
                while (((oChunk = pSummary.ParseNext()) != null) && (oChunk.sTag.ToLower() != "td"))
                {
                    ; // Do nothing!
                }

                // Skip the empty attributes
                oChunk = pSummary.ParseNext();

                // Each cell is reached by a fixed number of ParseNext calls. Any deviation
                // from the expected layout (including running out of chunks) throws and the
                // row is skipped by the catch below.
                try
                {
                    #region Parse Code for HTML Table
                    // Suck out the project number
                    p.ProjectNumber = Int32.Parse(oChunk.oHTML.ToString());

                    // Skip the closing tag, opening tags and attributes
                    oChunk = pSummary.ParseNext();
                    oChunk = pSummary.ParseNext();
                    oChunk = pSummary.ParseNext();
                    p.ServerIP = oChunk.oHTML.ToString().Trim();

                    // Skip the closing tag, opening tags and attributes
                    oChunk = pSummary.ParseNext();
                    oChunk = pSummary.ParseNext();
                    oChunk = pSummary.ParseNext();
                    p.WorkUnitName = oChunk.oHTML.ToString().Trim();

                    // Skip the closing tag, opening tags and attributes
                    oChunk = pSummary.ParseNext();
                    oChunk = pSummary.ParseNext();
                    oChunk = pSummary.ParseNext();
                    p.NumAtoms = Int32.Parse(oChunk.oHTML.ToString());

                    // Skip the closing tag, opening tags and attributes
                    oChunk = pSummary.ParseNext();
                    oChunk = pSummary.ParseNext();
                    oChunk = pSummary.ParseNext();
                    // Only the part before the decimal point is kept.
                    p.PreferredDays = Int32.Parse(oChunk.oHTML.ToString().Substring(0, oChunk.oHTML.IndexOf('.')).Trim());

                    // Skip the closing tag, opening tags and attributes
                    oChunk = pSummary.ParseNext();
                    oChunk = pSummary.ParseNext();
                    oChunk = pSummary.ParseNext();
                    try
                    {
                        p.MaxDays = Int32.Parse(oChunk.oHTML.ToString().Substring(0, oChunk.oHTML.IndexOf('.')).Trim());
                    }
                    catch
                    {
                        // An unparseable maximum does not invalidate the row.
                        p.MaxDays = 0;
                    }

                    oChunk = pSummary.ParseNext();
                    oChunk = pSummary.ParseNext();
                    oChunk = pSummary.ParseNext();
                    p.Credit = Int32.Parse(oChunk.oHTML.ToString().Substring(0, oChunk.oHTML.IndexOf('.')).Trim());

                    oChunk = pSummary.ParseNext();
                    oChunk = pSummary.ParseNext();
                    oChunk = pSummary.ParseNext();
                    p.Frames = Int32.Parse(oChunk.oHTML.ToString().Trim());

                    oChunk = pSummary.ParseNext();
                    oChunk = pSummary.ParseNext();
                    oChunk = pSummary.ParseNext();
                    p.Core = oChunk.oHTML.ToString();

                    oChunk = pSummary.ParseNext();
                    oChunk = pSummary.ParseNext();
                    oChunk = pSummary.ParseNext();
                    p.Description = oChunk.oParams["href"].ToString();

                    oChunk = pSummary.ParseNext();
                    oChunk = pSummary.ParseNext();
                    oChunk = pSummary.ParseNext();
                    oChunk = pSummary.ParseNext();
                    oChunk = pSummary.ParseNext();
                    oChunk = pSummary.ParseNext();
                    p.Contact = oChunk.oHTML.ToString();
                    #endregion

                    // Replace an existing entry for the project, otherwise add a new one.
                    if (this.ContainsKey(p.ProjectNumber))
                    {
                        this[p.ProjectNumber] = p;
                    }
                    else
                    {
                        this.Add(p.ProjectNumber, p);
                    }
                }
                catch (Exception Ex)
                {
                    // Ignore this row of the table - unparseable
                    ClassLogger.LogException(LogLevel.Warn, String.Format("{0} threw exception while parsing HTML: {1}", Debug.FunctionName, Ex.Message), null);
                }
            }
        }

        if (this.Count > 0)
        {
            OnNFOUpdated(new NFOUpdatedEventArgs());
        }
    }

    SaveToCSV(_LocalNFOFile);

    ClassLogger.Log(LogLevel.Trace, String.Format("{0} loaded {1} proteins from Stanford", Debug.FunctionName, ProteinCollection.Instance.Count), "");
    ClassLogger.Log(LogLevel.Trace, String.Format("{0} Execution Time: {1}", Debug.FunctionName, Debug.GetExecTime(Start)), "");
    return;
}
// ======================================================================
// Parse and create boxes
// ======================================================================
/// <summary>
/// Converts an HTML string into a tree of block elements whose atoms are characters,
/// images, and nested blocks, applying styles as tags open and close.
/// </summary>
/// <param name="html">HTML text to convert.</param>
/// <returns>The root block element.</returns>
private BlockElement ParseHtmlToBlocks(string html)
{
    IResourceProvider provider = ServiceRegistry.GetService<IResourceProvider>();
    StyleParser styles = new StyleParser(provider);

    // this is the root!
    BlockElement root = new BlockElement("root", styles.Style);
    BlockElement currentBlock = root;

    // if this is not HTML, do not parse tags. Otherwise search out and interpret tags.
    bool parseHTML = true;
    if (!parseHTML)
    {
        foreach (char character in html)
        {
            currentBlock.AddAtom(new CharacterElement(styles.Style, character));
        }

        return root;
    }

    HTMLparser parser = new HTMLparser(html);
    HTMLchunk chunk;
    while ((chunk = ParseNext(parser)) != null)
    {
        if (chunk.oHTML != string.Empty)
        {
            // This is a span of text. Make sure to replace escape characters, then add
            // the characters to the current box.
            string spanText = EscapeCharacters.ReplaceEscapeCharacters(chunk.oHTML);
            foreach (char character in spanText)
            {
                currentBlock.AddAtom(new CharacterElement(styles.Style, character));
            }

            continue;
        }

        // This is a tag. Interpret the tag and edit the open tags list.
        // It may also be an atom, in which case we should add it to the list of atoms!
        if (chunk.bClosure && !chunk.bEndClosure)
        {
            // A closing tag: close its style and, if it matches the current block, leave it.
            styles.CloseOneTag(chunk);
            if (currentBlock.Tag == chunk.sTag)
            {
                currentBlock = currentBlock.Parent;
            }

            continue;
        }

        AElement atom = null;
        bool isBlockTag = false;

        switch (chunk.sTag)
        {
            // Anchor elements are added to the open tag collection as HREFs.
            case "a":
                styles.InterpretHREF(chunk, null);
                break;

            // These html elements are ignored.
            case "body":
                break;

            // These html elements are blocks but can also have styles.
            case "center":
            case "left":
            case "right":
            case "div":
                atom = new BlockElement(chunk.sTag, styles.Style);
                styles.ParseTag(chunk, atom);
                isBlockTag = true;
                break;

            // These html elements are styles, and are added to the StyleParser.
            case "span":
            case "font":
            case "b":
            case "i":
            case "u":
            case "outline":
            case "big":
            case "basefont":
            case "medium":
            case "small":
                styles.ParseTag(chunk, null);
                break;

            // These html elements are added as atoms only. They cannot impart style
            // onto other atoms.
            case "br":
                atom = new CharacterElement(styles.Style, '\n');
                break;

            case "gumpimg":
                // draw a gump image
                atom = new ImageElement(styles.Style, ImageElement.ImageTypes.UI);
                styles.ParseTag(chunk, atom);
                break;

            case "itemimg":
                // draw a static image
                atom = new ImageElement(styles.Style, ImageElement.ImageTypes.Item);
                styles.ParseTag(chunk, atom);
                break;

            // Every other element is not interpreted, but rendered as text. Easy!
            default:
            {
                // Make sure to replace escape characters, then add the characters to
                // the current box.
                string rawTag = html.Substring(chunk.iChunkOffset, chunk.iChunkLength);
                rawTag = EscapeCharacters.ReplaceEscapeCharacters(rawTag);
                foreach (char character in rawTag)
                {
                    currentBlock.AddAtom(new CharacterElement(styles.Style, character));
                }

                break;
            }
        }

        if (atom != null)
        {
            currentBlock.AddAtom(atom);

            // A block tag that is not self-closed becomes the new current block.
            if (isBlockTag && !chunk.bEndClosure)
            {
                currentBlock = (BlockElement)atom;
            }
        }

        styles.CloseAnySoloTags();
    }

    return root;
}
/// <summary>
/// Wraps an existing Majestic-12 parser.
/// </summary>
/// <param name="parser">Parser instance stored in <c>Parser</c>.</param>
public HtmlParser(HTMLparser parser)
{
    this.Parser = parser;
}
/// <summary>
/// Tests parser by parsing chunk of data and then generating HTML on the basis of parsing
/// and comparing this to expected HTML: in case of any discrepancies assertion will be fired
/// </summary>
/// <param name="bData">Data to parse</param>
/// <param name="sExpectedHTML">Expected HTML as it gets generated by this very function; when null nothing is tested</param>
void TestParser(byte[] bData, string sExpectedHTML)
{
    if (sExpectedHTML == null)
    {
        return;
    }

    StringBuilder oSB = new StringBuilder(512);
    bool bEncodingSet = false;

    oP.Init(bData);

    // ok lets parse HTML and save the HTML that we view back into string
    HTMLchunk oChunk;

    // we don't want to use hashes as they would change order in which params are made
    oP.SetChunkHashMode(false);

    // we parse until returned oChunk is null indicating we reached end of parsing
    while ((oChunk = oP.ParseNext()) != null)
    {
        switch (oChunk.oType)
        {
            case HTMLchunkType.OpenTag:
                oSB.AppendFormat("<{0}", oChunk.sTag);

            // Shared tail that prints params and closes the tag; the CloseTag and Script
            // cases jump here when their chunk carries params.
            PrintParams:
                if (oChunk.sTag.Length == 4 && oChunk.sTag == "meta")
                {
                    if (!bEncodingSet)
                    {
                        if (HTMLparser.HandleMetaEncoding(oP, oChunk, ref bEncodingSet))
                        {
                            if (bEncodingSet)
                            {
                                // possible Title re-encoding should happen here
                            }
                        }
                    }
                }

                // commented out call to code that will do the job for you - long code below
                // is left to demonstrate how to access individual param values
                // Console.WriteLine(oChunk.GenerateParamsHTML());
                if (oChunk.bHashMode)
                {
                    if (oChunk.oParams.Count > 0)
                    {
                        foreach (string sParam in oChunk.oParams.Keys)
                        {
                            string sValue = oChunk.oParams[sParam].ToString();
                            if (sValue.Length > 0)
                            {
                                oSB.AppendFormat(" {0}='{1}'", sParam, oP.ChangeToEntities(sValue));
                            }
                            else
                            {
                                oSB.AppendFormat(" {0}", sParam);
                            }
                        }
                    }
                }
                else
                {
                    // this is alternative method of getting params -- it may look less convenient
                    // but it saves a LOT of CPU ticks while parsing. It makes sense when you only need
                    // params for a few
                    if (oChunk.iParams > 0)
                    {
                        for (int i = 0; i < oChunk.iParams; i++)
                        {
                            // here we can use exactly the same single/double quotes as they
                            // were used on params
                            string sValue = oChunk.sValues[i];
                            if (oChunk.bEntities)
                            {
                                sValue = oP.ChangeToEntities(sValue);
                            }

                            switch (oChunk.cParamChars[i])
                            {
                                case (byte)' ':
                                    // unquoted param
                                    if (oChunk.sValues[i].Length == 0)
                                    {
                                        oSB.AppendFormat(" {0}", oChunk.sParams[i]);
                                    }
                                    else
                                    {
                                        oSB.AppendFormat(" {0}={1}", oChunk.sParams[i], sValue);
                                    }
                                    break;

                                default:
                                    // quoted param: reuse the original quote character
                                    oSB.AppendFormat(" {0}={1}{2}{1}", oChunk.sParams[i], (char)oChunk.cParamChars[i], sValue);
                                    break;
                            }
                        }
                    }
                }

                if (oChunk.bClosure && !oP.bAutoMarkClosedTagsWithParamsAsOpen)
                {
                    oSB.Append("/>");
                }
                else
                {
                    oSB.Append(">");
                }
                break;

            // matched close tag, ie </a>
            case HTMLchunkType.CloseTag:
                if (oChunk.iParams > 0)
                {
                    oSB.AppendFormat("<{0}", oChunk.sTag);
                    goto PrintParams;
                }
                else
                {
                    if (oChunk.bEndClosure)
                    {
                        oSB.AppendFormat("<{0}/>", oChunk.sTag);
                    }
                    else
                    {
                        oSB.AppendFormat("</{0}>", oChunk.sTag);
                    }
                }
                break;

            // NOTE: you have to call finalisation because it is not done for Scripts or comments
            // Matched data between <script></script> tags
            case HTMLchunkType.Script:
                if (!oP.bAutoKeepScripts && !oP.bKeepRawHTML)
                {
                    oP.SetRawHTML(oChunk);
                }

                // Append, not AppendFormat: script text is data, and any '{' or '}' in it
                // would otherwise be read as a format item and throw or be collapsed.
                oSB.Append(oChunk.oHTML);

                if (oChunk.iParams > 0)
                {
                    goto PrintParams;
                }
                break;

            // NOTE: you have to call finalisation because it is not done for Scripts or comments
            // matched HTML comment, that's stuff between <!-- and -->
            case HTMLchunkType.Comment:
                if (!oP.bAutoExtractBetweenTagsOnly)
                {
                    oSB.AppendFormat("{0}", oChunk.oHTML);
                }
                else
                {
                    oSB.AppendFormat("<!--{0}-->", oChunk.oHTML);
                }
                break;

            // matched normal text
            case HTMLchunkType.Text:
                // skip pure whitespace that we are not really interested in
                if (oP.bCompressWhiteSpaceBeforeTag && oChunk.oHTML.Trim().Length == 0)
                {
                    continue;
                }

                oSB.AppendFormat("{0}", oChunk.bEntities ? oP.ChangeToEntities(oChunk.oHTML) : oChunk.oHTML);
                break;
        }
    }

    // now compare parsed HTML with the one we expect
    Assert.AreEqual(sExpectedHTML, oSB.ToString());
}
/// <summary>
/// Returns the next chunk from the given parser.
/// </summary>
/// <param name="parser">Parser to advance.</param>
/// <returns>Whatever <c>parser.ParseNext()</c> returns.</returns>
HTMLchunk ParseNext(HTMLparser parser)
{
    return parser.ParseNext();
}
/// <summary>
/// Handles META tags that set page encoding
/// </summary>
/// <param name="oP">HTML parser object that is used for parsing</param>
/// <param name="oChunk">Parsed chunk that should contain tag META</param>
/// <param name="bEncodingSet">Your own flag that shows whether encoding was already set or not, if set
/// once then it should not be changed - this is the logic applied by major browsers</param>
/// <returns>True if this was META tag setting Encoding, false otherwise</returns>
public static bool HandleMetaEncoding(HTMLparser oP, HTMLchunk oChunk, ref bool bEncodingSet)
{
    // Cheap length and first-character checks come before the full string comparison.
    if (oChunk.sTag.Length != 4 || oChunk.sTag[0] != 'm' || oChunk.sTag != "meta")
    {
        return false;
    }

    // if we do not use hashmode already then we call conversion explicitly
    // this is slow, but METAs are very rare so performance penalty is low
    if (!oChunk.bHashMode)
    {
        oChunk.ConvertParamsToHash();
    }

    string httpEquiv = oChunk.oParams["http-equiv"] as string;
    if (httpEquiv == null)
    {
        return false;
    }

    // FIXIT: lower-casing here is rarely needed and could be made faster.
    // "content-category" is a rare case (appears to work in IE) reported to exist in some
    // pages by Martin Bächtold.
    string loweredKey = httpEquiv.ToLower();
    if (loweredKey != "content-type" && loweredKey != "content-category")
    {
        return false;
    }

    // We might have a charset here that hints at the necessity to decode the page.
    // Once the encoding is set it should not be changed, even though there are web pages
    // out there that try to do that.
    if (!bEncodingSet)
    {
        // it is possible we have broken META tag without Content part
        string content = oChunk.oParams["content"] as string;
        if (content != null && oP.SetEncoding(content))
        {
            // From here on any text found so far (most likely just TITLE) would need to
            // be re-encoded by the caller.
            bEncodingSet = true;
        }
        // Otherwise setting the encoding failed - most likely the encoding string was
        // incorrect or the machine lacks the code page; a warning could be issued here.
    }

    return true;
}
/// <summary>
/// Parses HTML text into a node tree by replaying the parser's chunks into an <c>HtmlBuilder</c>.
/// </summary>
/// <param name="html">HTML text to parse; any doctype is stripped first.</param>
/// <returns>The node rendered by the builder.</returns>
public HtmlNode Parse(string html)
{
    // Majestic12 doesn't support doctype
    html = dedoctype.Replace(html, "");

    var builder = new HtmlBuilder();

    var parser = new HTMLparser();
    parser.bDecodeEntities = false;
    parser.SetChunkHashMode(true);
    parser.Init(html);

    for (var chunk = parser.ParseNext(); chunk != null; chunk = parser.ParseNext())
    {
        switch (chunk.oType)
        {
            case HTMLchunkType.OpenTag:
            {
                // if something goes wrong - ignore it
                if (chunk.sTag == "")
                {
                    break;
                }

                var attributes = new Dictionary<string, string>();
                if (chunk.iParams != 0)
                {
                    foreach (string name in chunk.oParams.Keys)
                    {
                        attributes.Add(name, (string)chunk.oParams[name]);
                    }
                }

                builder.OpenTag(chunk.sTag, attributes);
                break;
            }

            case HTMLchunkType.CloseTag:
            {
                // A self-closed tag is replayed as an open tag (with its attributes)
                // followed immediately by the matching close.
                if (chunk.bEndClosure)
                {
                    var attr = new Dictionary<string, string>();
                    if (chunk.iParams != 0)
                    {
                        foreach (string name in chunk.oParams.Keys)
                        {
                            attr.Add(name, (string)chunk.oParams[name]);
                        }
                    }

                    builder.OpenTag(chunk.sTag, attr);
                }

                builder.CloseTag(chunk.sTag);
                break;
            }

            case HTMLchunkType.Comment:
                builder.AddComment(chunk.oHTML);
                break;

            case HTMLchunkType.Script:
                builder.AddScript(chunk.oHTML);
                break;

            case HTMLchunkType.Text:
                builder.AddText(chunk.oHTML);
                break;

            default:
                break;
        }
    }

    return builder.Render();
}
/// <summary>
/// Handles META tags that set page encoding
/// </summary>
/// <param name="oP">HTML parser object that is used for parsing</param>
/// <param name="oChunk">Parsed chunk that should contain tag META</param>
/// <param name="bEncodingSet">Your own flag that shows whether encoding was already set or not, if set
/// once then it should not be changed - this is the logic applied by major browsers</param>
/// <returns>True if this was META tag setting Encoding, false otherwise</returns>
public static bool HandleMetaEncoding(HTMLparser oP, HTMLchunk oChunk, ref bool bEncodingSet)
{
    // Length and first character are tested first as a cheap pre-filter.
    bool isMeta = oChunk.sTag.Length == 4 && oChunk.sTag[0] == 'm' && oChunk.sTag == "meta";
    if (!isMeta)
    {
        return false;
    }

    // if we do not use hashmode already then we call conversion explicitly
    // this is slow, but METAs are very rare so performance penalty is low
    if (!oChunk.bHashMode)
    {
        oChunk.ConvertParamsToHash();
    }

    string sKey = oChunk.oParams["http-equiv"] as string;
    if (sKey == null)
    {
        return false;
    }

    // FIXIT: even though this is happening rare I really don't like lower casing stuff
    // that most likely would not need to be - this bit could be rewritten to be faster.
    switch (sKey.ToLower())
    {
        case "content-type":
        // rare case (appears to work in IE) reported to exist in some pages by Martin Bächtold
        case "content-category":
            break;

        default:
            return false;
    }

    // This is a META tag that may carry a charset. Once the encoding is set it should not
    // be changed, although there are web pages out there that try to.
    if (bEncodingSet)
    {
        return true;
    }

    // it is possible we have broken META tag without Content part
    string sData = oChunk.oParams["content"] as string;
    if (sData == null)
    {
        return true;
    }

    if (oP.SetEncoding(sData))
    {
        // Any text found so far (most likely just TITLE) would need re-encoding by the caller.
        bEncodingSet = true;
    }
    // else: failed to set encoding - most likely the encoding string was incorrect or the
    // machine lacks the code page; might be a good idea to put a warning message here.

    return true;
}