/// <summary>
/// Creates a lookup over the supplied HTML document.
/// </summary>
/// <param name="doc">
/// HTML text to hand to the parser. When it is null or empty no parser is
/// created and <c>_parser</c> keeps whatever value it already had.
/// </param>
public HtmlLookup(string doc)
{
    if (string.IsNullOrEmpty(doc))
    {
        return;
    }

    _parser = new HTMLparser();
    _parser.Init(doc);
}
/// <summary>
/// Walks the given HTML chunk by chunk and fills a fresh
/// <see cref="SearchEngineResult"/> tagged as <c>SearchEngineType.Baidu</c>.
/// </summary>
/// <param name="html">Page markup; it is re-encoded to bytes with <paramref name="encoding"/> before parsing.</param>
/// <param name="encoding">Encoding used both to produce the parser input and as the parser's initial encoding.</param>
/// <returns>The <c>searchResult</c> field after all chunks have been dispatched to the handlers.</returns>
public SearchEngineResult Parse(string html, Encoding encoding)
{
    HTMLparser parser = HtmlParserFactory.GetInstance();

    // Reset the per-parse state the Handle* callbacks write into.
    searchResult = new SearchEngineResult();
    searchResult.SearchEngineType = SearchEngineType.Baidu;
    item = new SearchEngineResult.ResultItem();

    parser.Init(encoding.GetBytes(html));
    parser.SetEncoding(encoding);

    int state = 0;
    bool encodingSet = false;

    for (HTMLchunk chunk = parser.ParseNext(); chunk != null; chunk = parser.ParseNext())
    {
        if (chunk.oType == HTMLchunkType.OpenTag)
        {
            HandleOpenTag(chunk, ref state);

            // A <meta> tag may carry a charset declaration.
            if (chunk.sTag == "meta")
            {
                HandleMetaEncoding(parser, chunk, ref encodingSet);
            }

            HandleParam(chunk, ref state);
        }
        else if (chunk.oType == HTMLchunkType.CloseTag)
        {
            HandleCloseTag(chunk, ref state);
        }
        else if (chunk.oType == HTMLchunkType.Text)
        {
            HandleText(chunk, ref state);
        }
        // Every other chunk type is ignored.
    }

    return searchResult;
}
/// <summary>
/// Walks the given HTML chunk by chunk and fills a fresh
/// <see cref="DictResult"/> tagged as <c>DictionaryType.Dict_cn</c>.
/// </summary>
/// <param name="html">Page markup; it is re-encoded to bytes with <paramref name="encoding"/> before parsing.</param>
/// <param name="encoding">Encoding used both to produce the parser input and as the parser's initial encoding.</param>
/// <returns>The <c>dictResult</c> field after all chunks have been dispatched to the handlers.</returns>
public SmartMe.Core.Data.DictResult Parse(string html, Encoding encoding)
{
    // Reset the result object the Handle* callbacks write into.
    dictResult = new DictResult();
    HTMLparser parser = HtmlParserFactory.GetInstance();
    dictResult.DictionaryType = DictionaryType.Dict_cn;

    parser.Init(encoding.GetBytes(html));
    parser.SetEncoding(encoding);

    int state = 0;
    bool encodingSet = false;

    for (HTMLchunk chunk = parser.ParseNext(); chunk != null; chunk = parser.ParseNext())
    {
        if (chunk.oType == HTMLchunkType.OpenTag)
        {
            HandleOpenTag(chunk, ref state);

            // A <meta> tag may carry a charset declaration.
            if (chunk.sTag == "meta")
            {
                HandleMetaEncoding(parser, chunk, ref encodingSet);
            }

            HandleParam(chunk, ref state);
        }
        else if (chunk.oType == HTMLchunkType.CloseTag)
        {
            HandleCloseTag(chunk, ref state);
        }
        else if (chunk.oType == HTMLchunkType.Text)
        {
            HandleText(chunk, ref state);
        }
        // Every other chunk type is ignored.
    }

    return dictResult;
}
/// <summary>
/// Builds a list of <see cref="Protein"/> records from an HTML table, one per
/// <c>tr</c> row whose first cell parses as an integer project number.
/// </summary>
/// <param name="html">Markup containing the table rows.</param>
/// <returns>The proteins read, in document order. Rows whose first cell is not an integer are skipped.</returns>
private static List<Protein> ParseProteins(string html)
{
    var parser = new HTMLparser();
    parser.Init(html);

    var proteins = new List<Protein>();

    for (HTMLchunk chunk = parser.ParseNext(); chunk != null; chunk = parser.ParseNext())
    {
        // Only an opening "tr" tag starts a row.
        bool startsRow = chunk.oType.Equals(HTMLchunkType.OpenTag) && chunk.sTag.ToLower() == "tr";
        if (!startsRow)
        {
            continue;
        }

        var protein = new Protein();

        // The first cell must be the project number; header rows fail here and are skipped.
        int projectNumber;
        bool hasProjectNumber = Int32.TryParse(
            GetNextTdValue(parser),
            NumberStyles.Integer,
            CultureInfo.InvariantCulture,
            out projectNumber);
        if (!hasProjectNumber)
        {
            continue;
        }

        protein.ProjectNumber = projectNumber;

        // The remaining cells are consumed strictly in column order.
        protein.ServerIP = GetNextTdValue(parser);
        protein.WorkUnitName = GetNextTdValue(parser);
        protein.NumberOfAtoms = ToInt32OrDefault(GetNextTdValue(parser));
        protein.PreferredDays = ToDoubleOrDefault(GetNextTdValue(parser));
        protein.MaximumDays = ToDoubleOrDefault(GetNextTdValue(parser));
        protein.Credit = ToDoubleOrDefault(GetNextTdValue(parser));
        protein.Frames = ToInt32OrDefault(GetNextTdValue(parser));
        protein.Core = GetNextTdValue(parser);
        protein.Description = GetNextTdValue(parser, "href");
        protein.Contact = GetNextTdValue(parser);
        protein.KFactor = ToDoubleOrDefault(GetNextTdValue(parser));

        proteins.Add(protein);
    }

    return proteins;
}
/// <summary>
/// Creates a new <c>Parser</c> for the session's response body and selects
/// the text encoding it should use.
/// </summary>
/// <param name="session">Session whose response body and charset are read.</param>
public void Open(Session session)
{
    Parser = new HTMLparser();

    try
    {
        bool isMarkup = Utility.IsResponseHtml(session) || Utility.IsResponseXml(session);
        if (isMarkup)
        {
            // A missing body is parsed as an empty document.
            Parser.Init(session.responseBodyBytes ?? new byte[] { });
            Parser.bAutoKeepScripts = true;
            Parser.bEnableHeuristics = false;

            // When bAutoExtractBetweenTagsOnly is false, the parser will see attributes
            // in the script tags, such as <script src="mydata">. Otherwise it will not.
            Parser.bAutoExtractBetweenTagsOnly = true;
        }
    }
    catch (Exception e)
    {
        Trace.TraceWarning("Warning: UtilityHtmlParser threw an unhandled exception: {0}", e.Message);
        ExceptionLogger.HandleException(e);
    }

    // Get the encoding name from the HTML or HTTP
    String charset = Utility.GetHtmlCharset(session);

    try
    {
        // TODO: check if the encoding is a known good before continuing!!!
        // Probe the charset name first: GetEncoding throws an ArgumentException
        // when the name is not a valid system encoding name.
        Encoding.GetEncoding(charset);
        Parser.SetEncoding(charset);
    }
    catch (ArgumentException)
    {
        // Default to utf-8 if the charset name is not recognised.
        Parser.SetEncoding(new UTF8Encoding(false, false));
    }
}
/// <summary>
/// Parses HTML text into an <see cref="HtmlNode"/> tree by replaying the
/// parser's chunks into an <c>HtmlBuilder</c>.
/// </summary>
/// <param name="html">Markup to parse; anything matched by <c>dedoctype</c> is stripped first.</param>
/// <returns>The result of <c>HtmlBuilder.Render()</c> after all chunks have been replayed.</returns>
public HtmlNode Parse(string html)
{
    // Majestic12 doesn't support doctype
    html = dedoctype.Replace(html, "");

    var builder = new HtmlBuilder();

    var parser = new HTMLparser();
    parser.bDecodeEntities = false;
    parser.SetChunkHashMode(true);
    parser.Init(html);

    for (var chunk = parser.ParseNext(); chunk != null; chunk = parser.ParseNext())
    {
        switch (chunk.oType)
        {
            case HTMLchunkType.OpenTag:
                // if something goes wrong - ignore it
                if (chunk.sTag != "")
                {
                    builder.OpenTag(chunk.sTag, CollectChunkAttributes(chunk));
                }
                break;

            case HTMLchunkType.Comment:
                builder.AddComment(chunk.oHTML);
                break;

            case HTMLchunkType.CloseTag:
                // A self-closed tag (bEndClosure) is reported as a close tag,
                // so it is opened here before being closed.
                if (chunk.bEndClosure)
                {
                    builder.OpenTag(chunk.sTag, CollectChunkAttributes(chunk));
                }
                builder.CloseTag(chunk.sTag);
                break;

            case HTMLchunkType.Script:
                builder.AddScript(chunk.oHTML);
                break;

            case HTMLchunkType.Text:
                builder.AddText(chunk.oHTML);
                break;

            default:
                break;
        }
    }

    return builder.Render();
}

/// <summary>
/// Copies the parameters of a chunk into a name/value dictionary.
/// </summary>
/// <param name="chunk">Chunk whose <c>oParams</c> are read when <c>iParams</c> is non-zero.</param>
/// <returns>A new dictionary; empty when the chunk reports no parameters.</returns>
private static Dictionary<string, string> CollectChunkAttributes(HTMLchunk chunk)
{
    var collected = new Dictionary<string, string>();
    if (chunk.iParams == 0)
    {
        return collected;
    }

    foreach (string name in chunk.oParams.Keys)
    {
        collected.Add(name, (string)chunk.oParams[name]);
    }

    return collected;
}
// ============================================================================================================
// Parse and create boxes
// ============================================================================================================

/// <summary>
/// Converts markup into a tree of block elements whose atoms are characters,
/// images and nested blocks.
/// </summary>
/// <param name="html">Markup to interpret.</param>
/// <returns>The root block (tag "root") that every parsed atom hangs from.</returns>
BlockElement ParseHtmlToBlocks(string html)
{
    IResourceProvider provider = Service.Get<IResourceProvider>();
    StyleParser styles = new StyleParser(provider);

    // The root block; 'target' always points at the block currently receiving atoms.
    BlockElement root = new BlockElement("root", styles.Style);
    BlockElement target = root;

    m_Parser.Init(html);

    for (HTMLchunk chunk = ParseNext(m_Parser); chunk != null; chunk = ParseNext(m_Parser))
    {
        // A chunk that carries HTML text is a span of text, not a tag.
        if (chunk.oHTML != string.Empty)
        {
            // make sure to replace escape characters!
            string spanText = EscapeCharacters.ReplaceEscapeCharacters(chunk.oHTML);
            foreach (char c in spanText)
            {
                target.AddAtom(new CharacterElement(styles.Style, c));
            }
            continue;
        }

        // A pure closing tag: pop its style and, if it closes the current block, step out of it.
        if (chunk.bClosure && !chunk.bEndClosure)
        {
            styles.CloseOneTag(chunk);
            if (target.Tag == chunk.sTag)
            {
                target = target.Parent;
            }
            continue;
        }

        // An opening (or self-closed) tag. It may yield an atom and may open a new block.
        AElement atom = null;
        bool opensBlock = false;

        switch (chunk.sTag)
        {
            // Anchor elements are added to the open tag collection as HREFs.
            case "a":
                styles.InterpretHREF(chunk, null);
                break;

            // These html elements are ignored.
            case "body":
                break;

            // These html elements are blocks but can also have styles
            case "center":
            case "left":
            case "right":
            case "div":
                atom = new BlockElement(chunk.sTag, styles.Style);
                styles.ParseTag(chunk, atom);
                opensBlock = true;
                break;

            // These html elements are styles, and are added to the StyleParser.
            case "span":
            case "font":
            case "b":
            case "i":
            case "u":
            case "outline":
            case "big":
            case "basefont":
            case "medium":
            case "small":
                styles.ParseTag(chunk, null);
                break;

            // These html elements are added as atoms only. They cannot impart style
            // onto other atoms.
            case "br":
                atom = new CharacterElement(styles.Style, '\n');
                break;

            case "gumpimg":
                // draw a gump image
                atom = new ImageElement(styles.Style, ImageElement.ImageTypes.UI);
                styles.ParseTag(chunk, atom);
                break;

            case "itemimg":
                // draw a static image
                atom = new ImageElement(styles.Style, ImageElement.ImageTypes.Item);
                styles.ParseTag(chunk, atom);
                break;

            // Every other element is not interpreted, but rendered as text, using
            // the tag's original characters from the input string.
            default:
            {
                string rawTag = html.Substring(chunk.iChunkOffset, chunk.iChunkLength);
                // make sure to replace escape characters!
                rawTag = EscapeCharacters.ReplaceEscapeCharacters(rawTag);
                foreach (char c in rawTag)
                {
                    target.AddAtom(new CharacterElement(styles.Style, c));
                }
                break;
            }
        }

        if (atom != null)
        {
            target.AddAtom(atom);

            // Descend into a block tag unless it was self-closed.
            if (opensBlock && !chunk.bEndClosure)
            {
                target = (BlockElement)atom;
            }
        }

        styles.CloseAnySoloTags();
    }

    return root;
}
/// <summary>
/// Downloads the NOAA index page for the date ten days before today, looks for
/// the first anchor whose <c>href</c> starts with <c>ftp://</c>, and downloads
/// that target to <c>SST.nc</c> in the simulation work folder unless the file
/// already exists.
/// </summary>
/// <returns>
/// <c>true</c> when an ftp link was found and <c>SST.nc</c> is present afterwards
/// (already there or just downloaded); <c>false</c> when no link was found or any
/// step failed. Previously the method returned <c>false</c> on every path.
/// </returns>
private bool RequestSstFile()
{
    try
    {
        DateTime dt = DateTime.Today.AddDays(-10);
        string getUrl = ReplaceMacros(_noaaSstGetUrl, dt);

        // Unsubscribe before subscribing so repeated calls do not stack up
        // duplicate copies of the handler on the process-wide callback.
        ServicePointManager.ServerCertificateValidationCallback -= ValidateRemoteCertificate;
        ServicePointManager.ServerCertificateValidationCallback += ValidateRemoteCertificate;
        ServicePointManager.SecurityProtocol = (SecurityProtocolType)3072; //TLS 1.2

        string response;
        using (WebClient wc = new WebClient())
        {
            response = wc.DownloadString(getUrl);
        }

        HTMLparser parser = new HTMLparser();
        parser.Init(response);

        HTMLchunk chunk = null;
        while ((chunk = parser.ParseNext()) != null)
        {
            // Only opening <a> tags that carry parameters are of interest.
            if (chunk.sTag != "a" || chunk.oType != HTMLchunkType.OpenTag)
            {
                continue;
            }
            if (chunk.oParams == null || chunk.oParams.Count < 1)
            {
                continue;
            }

            string href = chunk.oParams["href"]?.ToString();
            if (string.IsNullOrEmpty(href) || !href.StartsWith("ftp://", StringComparison.Ordinal))
            {
                continue;
            }

            string file = Path.Combine(SimulationData.WorkFolder, "SST.nc");
            if (!File.Exists(file))
            {
                using (WebClient wc = new WebClient())
                {
                    wc.DownloadFile(href, file);
                }
            }

            // The file now exists, so later ftp links could not trigger another
            // download; stop here and report success.
            return true;
        }
    }
    catch (Exception ex)
    {
        // Best effort: a failed request must not take the caller down, but the
        // failure is traced instead of being silently discarded.
        System.Diagnostics.Trace.TraceWarning("RequestSstFile failed: {0}", ex);
    }

    return false;
}
/// <summary>
/// Parses raw HTML bytes with Majestic-12 and rebuilds the chunk stream as a
/// LINQ-to-XML tree under a synthetic "document" element.
/// </summary>
/// <param name="htmlAsBytes">HTML content to parse.</param>
/// <returns>The child nodes of the top-most element reached after parsing.</returns>
/// <exception cref="Exception">
/// Wraps any failure while handling a chunk; the failing chunk's source text is
/// stored under the "source" key of the exception's <c>Data</c>.
/// </exception>
public static IEnumerable<XNode> ConvertNodesToXml(byte[] htmlAsBytes)
{
    HTMLparser parser = OpenParser();
    parser.Init(htmlAsBytes);

    // 'cursor' is the element that currently receives new children.
    XElement cursor = new XElement("document");
    int xmlnsAttributeIndex = 0;
    string originalHtml = "";

    HTMLchunk chunk;
    while ((chunk = parser.ParseNext()) != null)
    {
        try
        {
            Debug.Assert(!chunk.bHashMode);  // popular default for Majestic-12 setting

            XNode created;
            switch (chunk.oType)
            {
                case HTMLchunkType.OpenTag:
                    // Tags are added as a child to the current tag,
                    // except when the new tag implies the closure of
                    // some number of ancestor tags.
                    created = ParseTagNode(chunk, originalHtml, ref xmlnsAttributeIndex);
                    if (created != null)
                    {
                        cursor = FindParentOfNewNode(chunk, originalHtml, cursor);
                        cursor.Add(created);
                        cursor = created as XElement;
                    }
                    break;

                case HTMLchunkType.CloseTag:
                    if (chunk.bEndClosure)
                    {
                        // Self-closed tag: attach it, but do not descend into it.
                        created = ParseTagNode(chunk, originalHtml, ref xmlnsAttributeIndex);
                        if (created != null)
                        {
                            cursor = FindParentOfNewNode(chunk, originalHtml, cursor);
                            cursor.Add(created);
                        }
                    }
                    else
                    {
                        // Walk up to the nearest ancestor with the closing tag's name;
                        // when none matches, the cursor stays where it is.
                        string closingName = CleanupTagName(chunk.sTag, originalHtml);
                        XElement match = cursor;
                        while (match != null && match.Name.LocalName != closingName)
                        {
                            match = match.Parent;
                        }

                        if (match != null)
                        {
                            cursor = match.Parent;
                        }

                        Debug.Assert(cursor != null);
                    }
                    break;

                case HTMLchunkType.Script:
                    cursor.Add(new XElement("script", "REMOVED"));
                    break;

                case HTMLchunkType.Comment:
                    if (chunk.sTag == "!--")
                    {
                        created = new XComment(chunk.oHTML);
                    }
                    else if (chunk.sTag == "![CDATA[")
                    {
                        created = new XCData(chunk.oHTML);
                    }
                    else
                    {
                        throw new Exception("Unrecognized comment sTag");
                    }
                    cursor.Add(created);
                    break;

                case HTMLchunkType.Text:
                    cursor.Add(chunk.oHTML);
                    break;

                default:
                    break;
            }
        }
        catch (Exception e)
        {
            var wrappedE = new Exception("Error using Majestic12.HTMLChunk, reason: " + e.Message, e);

            // the original html is copied for tracing/debugging purposes
            originalHtml = new string(htmlAsBytes.Skip(chunk.iChunkOffset)
                                      .Take(chunk.iChunkLength)
                                      .Select(B => (char)B).ToArray());
            wrappedE.Data.Add("source", originalHtml);
            throw wrappedE;
        }
    }

    // Climb back to the top-most element before returning its children.
    while (cursor.Parent != null)
    {
        cursor = cursor.Parent;
    }

    return cursor.Nodes();
}
/// <summary>
/// Download project information from Stanford University (psummaryC.html),
/// parse each table row into a <see cref="Protein"/> and store it in this
/// collection keyed by project number, then save the collection to CSV.
/// </summary>
/// <remarks>
/// The web response and its reader are disposed once the page has been read.
/// Any failure while requesting or reading the page is logged and ends the
/// call without touching the collection or the CSV file.
/// </remarks>
/// <param name="State">Null in this implementation</param>
public void DownloadFromStanford(Object State /* null */)
{
    DateTime Start = Debug.ExecStart;

    // NOTE(review): lock (this) is kept because other members may synchronise on
    // the same instance; a private gate object would be safer - confirm before changing.
    lock (this)
    {
        Preferences.PreferenceSet Prefs = Preferences.PreferenceSet.Instance;

        WebRequest wrq = (WebRequest)WebRequest.Create("http://vspx27.stanford.edu/psummaryC.html");
        wrq.Method = WebRequestMethods.Http.Get;

        if (Prefs.UseProxy)
        {
            wrq.Proxy = new WebProxy(Prefs.ProxyServer, Prefs.ProxyPort);
            if (Prefs.UseProxyAuth)
            {
                wrq.Proxy.Credentials = new NetworkCredential(Prefs.ProxyUser, Prefs.ProxyPass);
            }
        }
        else
        {
            wrq.Proxy = null;
        }

        // TODO: Handle timeouts
        String sSummaryPage;
        try
        {
            // Request and read inside one try so read failures are logged too,
            // and dispose both the response and the reader when done.
            using (WebResponse wrs = wrq.GetResponse())
            {
                Stream body = (wrs == null) ? null : wrs.GetResponseStream();
                if (body == null)
                {
                    throw new IOException("The web response or stream was null");
                }

                using (StreamReader sr1 = new StreamReader(body, Encoding.ASCII))
                {
                    sSummaryPage = sr1.ReadToEnd();
                }
            }
        }
        catch (Exception Ex)
        {
            // Name the exception family in the message (the generic case used
            // to be reported as "WebException").
            String kind = (Ex is WebException) ? "WebException"
                        : (Ex is IOException) ? "IOException"
                        : "Exception";
            ClassLogger.LogException(LogLevel.Warn, String.Format("{0} threw {1} {2}.", Debug.FunctionName, kind, Ex.Message), null);
            ClassLogger.Log(LogLevel.Trace, String.Format("{0} Execution Time: {1}", Debug.FunctionName, Debug.GetExecTime(Start)), "");
            return;
        }

        HTMLparser pSummary = new HTMLparser();
        pSummary.Init(sSummaryPage);

        // Locate the table
        HTMLchunk oChunk = null;

        // Parse until returned oChunk is null indicating we reached end of parsing
        while ((oChunk = pSummary.ParseNext()) != null)
        {
            if (!String.Equals(oChunk.sTag, "tr", StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            Protein p = new Protein();

            // Advance to the first "td" chunk of the row.
            while (((oChunk = pSummary.ParseNext()) != null) &&
                   !String.Equals(oChunk.sTag, "td", StringComparison.OrdinalIgnoreCase))
            {
                ; // Do nothing!
            }

            if (oChunk == null)
            {
                // Input ended inside a row: nothing left to parse.
                break;
            }

            // Skip the empty attributes
            oChunk = pSummary.ParseNext();

            try
            {
                // Each value is read from the chunk a fixed number of ParseNext
                // calls after the previous one (closing tag, opening tag and
                // attributes are skipped in between).

                // Suck out the project number
                p.ProjectNumber = Int32.Parse(oChunk.oHTML);

                oChunk = SkipChunks(pSummary, 3);
                p.ServerIP = oChunk.oHTML.Trim();

                oChunk = SkipChunks(pSummary, 3);
                p.WorkUnitName = oChunk.oHTML.Trim();

                oChunk = SkipChunks(pSummary, 3);
                p.NumAtoms = Int32.Parse(oChunk.oHTML);

                // Day and credit values keep only the digits before the '.'.
                oChunk = SkipChunks(pSummary, 3);
                p.PreferredDays = Int32.Parse(oChunk.oHTML.Substring(0, oChunk.oHTML.IndexOf('.')).Trim());

                oChunk = SkipChunks(pSummary, 3);
                try
                {
                    p.MaxDays = Int32.Parse(oChunk.oHTML.Substring(0, oChunk.oHTML.IndexOf('.')).Trim());
                }
                catch
                {
                    // An unparseable maximum is treated as 0 instead of rejecting the row.
                    p.MaxDays = 0;
                }

                oChunk = SkipChunks(pSummary, 3);
                p.Credit = Int32.Parse(oChunk.oHTML.Substring(0, oChunk.oHTML.IndexOf('.')).Trim());

                oChunk = SkipChunks(pSummary, 3);
                p.Frames = Int32.Parse(oChunk.oHTML.Trim());

                oChunk = SkipChunks(pSummary, 3);
                p.Core = oChunk.oHTML;

                // The description is the href of the anchor in this cell.
                oChunk = SkipChunks(pSummary, 3);
                p.Description = oChunk.oParams["href"].ToString();

                oChunk = SkipChunks(pSummary, 6);
                p.Contact = oChunk.oHTML;

                if (this.ContainsKey(p.ProjectNumber))
                {
                    this[p.ProjectNumber] = p;
                }
                else
                {
                    this.Add(p.ProjectNumber, p);
                }
            }
            catch (Exception Ex)
            {
                // Ignore this row of the table - unparseable
                ClassLogger.LogException(LogLevel.Warn, String.Format("{0} threw exception while parsing HTML: {1}", Debug.FunctionName, Ex.Message), null);
            }
        }

        if (this.Count > 0)
        {
            OnNFOUpdated(new NFOUpdatedEventArgs());
        }
    }

    SaveToCSV(_LocalNFOFile);

    ClassLogger.Log(LogLevel.Trace, String.Format("{0} loaded {1} proteins from Stanford", Debug.FunctionName, ProteinCollection.Instance.Count), "");
    ClassLogger.Log(LogLevel.Trace, String.Format("{0} Execution Time: {1}", Debug.FunctionName, Debug.GetExecTime(Start)), "");
    return;
}

/// <summary>
/// Calls <c>ParseNext</c> on the parser <paramref name="count"/> times.
/// </summary>
/// <param name="parser">Parser to advance.</param>
/// <param name="count">Number of chunks to read.</param>
/// <returns>The chunk returned by the last call (null when <paramref name="count"/> is not positive or parsing has ended).</returns>
private static HTMLchunk SkipChunks(HTMLparser parser, int count)
{
    HTMLchunk last = null;
    for (int i = 0; i < count; i++)
    {
        last = parser.ParseNext();
    }
    return last;
}
/// <summary>
/// Tests parser by parsing chunk of data and then generating HTML on the basis of parsing
/// and comparing this to expected HTML: in case of any discrepancies assertion will be fired
/// </summary>
/// <remarks>
/// Script content is appended verbatim. It must not be passed through
/// <c>AppendFormat</c> as a format string, because braces in the script would
/// then raise <see cref="FormatException"/> or be rewritten.
/// </remarks>
/// <param name="bData">Data to parse</param>
/// <param name="sExpectedHTML">Expected HTML as it gets generated by this very function; when null the test is skipped</param>
void TestParser(byte[] bData, string sExpectedHTML)
{
    if (sExpectedHTML == null)
    {
        return;
    }

    StringBuilder oSB = new StringBuilder(512);

    bool bEncodingSet = false;

    oP.Init(bData);

    // ok lets parse HTML and save the HTML that we view back into string
    HTMLchunk oChunk;

    // we don't want to use hashes as they would change order in which params are made
    oP.SetChunkHashMode(false);

    // we parse until returned oChunk is null indicating we reached end of parsing
    while ((oChunk = oP.ParseNext()) != null)
    {
        switch (oChunk.oType)
        {
            case HTMLchunkType.OpenTag:

                oSB.AppendFormat("<{0}", oChunk.sTag);

            // Shared tail: the CloseTag and Script cases jump here to emit
            // parameters followed by the closing bracket.
            PrintParams:

                if (oChunk.sTag.Length == 4 && oChunk.sTag == "meta")
                {
                    if (!bEncodingSet)
                    {
                        if (HTMLparser.HandleMetaEncoding(oP, oChunk, ref bEncodingSet))
                        {
                            if (bEncodingSet)
                            {
                                // possible Title re-encoding should happen here
                            }
                        }
                    }
                }

                // commented out call to code that will do the job for you - long code below
                // is left to demonstrate how to access individual param values
                // Console.WriteLine(oChunk.GenerateParamsHTML());

                if (oChunk.bHashMode)
                {
                    if (oChunk.oParams.Count > 0)
                    {
                        foreach (string sParam in oChunk.oParams.Keys)
                        {
                            string sValue = oChunk.oParams[sParam].ToString();

                            if (sValue.Length > 0)
                            {
                                oSB.AppendFormat(" {0}='{1}'", sParam, oP.ChangeToEntities(sValue));
                            }
                            else
                            {
                                oSB.AppendFormat(" {0}", sParam);
                            }
                        }
                    }
                }
                else
                {
                    // this is alternative method of getting params -- it may look less convenient
                    // but it saves a LOT of CPU ticks while parsing. It makes sense when you only need
                    // params for a few
                    if (oChunk.iParams > 0)
                    {
                        for (int i = 0; i < oChunk.iParams; i++)
                        {
                            // here we can use exactly the same single/double quotes as they
                            // were used on params
                            string sValue = oChunk.sValues[i];

                            if (oChunk.bEntities)
                            {
                                sValue = oP.ChangeToEntities(sValue);
                            }

                            switch (oChunk.cParamChars[i])
                            {
                                // a space as the quote character means the value was unquoted
                                case (byte)' ':
                                    if (oChunk.sValues[i].Length == 0)
                                    {
                                        oSB.AppendFormat(" {0}", oChunk.sParams[i]);
                                    }
                                    else
                                    {
                                        oSB.AppendFormat(" {0}={1}", oChunk.sParams[i], sValue);
                                    }
                                    break;

                                default:
                                    oSB.AppendFormat(" {0}={1}{2}{1}", oChunk.sParams[i], (char)oChunk.cParamChars[i], sValue);
                                    break;
                            }
                        }
                    }
                }

                if (oChunk.bClosure && !oP.bAutoMarkClosedTagsWithParamsAsOpen)
                {
                    oSB.Append("/>");
                }
                else
                {
                    oSB.Append(">");
                }
                break;

            // matched close tag, ie </a>
            case HTMLchunkType.CloseTag:

                if (oChunk.iParams > 0)
                {
                    oSB.AppendFormat("<{0}", oChunk.sTag);
                    goto PrintParams;
                }
                else
                {
                    if (oChunk.bEndClosure)
                    {
                        oSB.AppendFormat("<{0}/>", oChunk.sTag);
                    }
                    else
                    {
                        oSB.AppendFormat("</{0}>", oChunk.sTag);
                    }
                }
                break;

            // NOTE: you have to call finalisation because it is not done for Scripts or comments
            // Matched data between <script></script> tags
            case HTMLchunkType.Script:

                if (!oP.bAutoKeepScripts && !oP.bKeepRawHTML)
                {
                    oP.SetRawHTML(oChunk);
                }

                // Append, not AppendFormat: the script text is data, not a format string.
                oSB.Append(oChunk.oHTML);

                if (oChunk.iParams > 0)
                {
                    goto PrintParams;
                }
                break;

            // NOTE: you have to call finalisation because it is not done for Scripts or comments
            // matched HTML comment, that's stuff between <!-- and -->
            case HTMLchunkType.Comment:

                if (!oP.bAutoExtractBetweenTagsOnly)
                {
                    oSB.AppendFormat("{0}", oChunk.oHTML);
                }
                else
                {
                    oSB.AppendFormat("<!--{0}-->", oChunk.oHTML);
                }
                break;

            // matched normal text
            case HTMLchunkType.Text:

                // skip pure whitespace that we are not really interested in
                if (oP.bCompressWhiteSpaceBeforeTag && oChunk.oHTML.Trim().Length == 0)
                {
                    continue;
                }

                oSB.AppendFormat("{0}", oChunk.bEntities ? oP.ChangeToEntities(oChunk.oHTML) : oChunk.oHTML);
                break;
        }
    }

    // now compare parsed HTML with the one we expect
    Assert.AreEqual(sExpectedHTML, oSB.ToString());
}