/// <summary>
/// Examines every attribute of a tag and, depending on the current state,
/// either advances the state or copies link data into the current item.
/// </summary>
/// <param name="oChunk">Tag chunk whose attribute names (sParams) and values (sValues) are read.</param>
/// <param name="state">Current parser state; updated in place when a marker attribute is found.</param>
private void HandleParam(HTMLchunk oChunk, ref int state)
{
    for (int idx = 0; idx < oChunk.iParams; idx++)
    {
        // Only a default label exists, so all attributes are treated alike
        // whatever their cParamChars entry is.
        switch (oChunk.cParamChars[idx])
        {
            default:
                string name = oChunk.sParams[idx];
                string value = oChunk.sValues[idx];

                switch (state)
                {
                    case 1:
                        // id="bodyContent" moves 1 -> 2
                        if (name == "id" && value == "bodyContent")
                        {
                            state = 2;
                        }
                        break;

                    case 3:
                        // class="mw-search-results" moves 3 -> 4
                        if (name == "class" && value == "mw-search-results")
                        {
                            state = 4;
                        }
                        break;

                    case 6:
                        // In state 6 the link target and title are captured.
                        if (name == "href")
                        {
                            item.Url = "http://en.wikipedia.org" + value;
                        }
                        else if (name == "title")
                        {
                            item.Title = value;
                        }
                        break;
                }
                break;
        }
    }
}
/// <summary>
/// Reads chunks from the shared parser until it is exhausted and returns the
/// value of attribute <paramref name="pn"/> taken from an open tag named
/// <paramref name="nn"/> whose "name" attribute equals <paramref name="tv"/>.
/// </summary>
/// <param name="nn">Tag name to look for.</param>
/// <param name="tv">Required value of the tag's "name" attribute.</param>
/// <param name="pn">Name of the attribute whose value is returned.</param>
/// <returns>The attribute value of the last matching tag, or null when none matched.</returns>
static string getNPVByNm(string nn, string tv, string pn)
{
    string found = null;

    for (HTMLchunk current = parser.ParseNext(); current != null; current = parser.ParseNext())
    {
        // Only open tags are of interest; every other chunk type is skipped.
        if (current.oType != HTMLchunkType.OpenTag)
        {
            continue;
        }

        if (current.sTag != nn)
        {
            continue;
        }

        if (current.GetParamValue("name") != tv)
        {
            continue;
        }

        // No early exit: a later matching tag overrides an earlier one.
        found = current.GetParamValue(pn);
    }

    return found;
}
/// <summary>
/// Appends the raw HTML of a text chunk to the current item's description
/// while the parser is in state 7; does nothing in any other state.
/// </summary>
/// <param name="oChunk">Text chunk whose oHTML is appended.</param>
/// <param name="state">Current parser state; read only.</param>
private void HandleText(HTMLchunk oChunk, ref int state)
{
    if (state != 7)
    {
        return;
    }

    item.Description += oChunk.oHTML;
}
/// <summary>
/// Processes one tag chunk: plain closing tags close the most recent matching
/// open tag; opening and self-closing tags are pushed, parsed and, when
/// self-closing, closed again immediately.
/// </summary>
/// <param name="chunk">The tag chunk to process.</param>
/// <param name="atom">Optional element that receives the current style; may be null.</param>
public void ParseTag(HTMLchunk chunk, AElement atom)
{
    // A closing tag that is not self-closing only closes an open tag.
    bool isPlainClosingTag = chunk.bClosure && !chunk.bEndClosure;
    if (isPlainClosingTag)
    {
        CloseOneTag(chunk);
        return;
    }

    // Register the tag as open, then let the tag-level overload process it.
    OpenTag opened = new OpenTag(chunk);
    m_OpenTags.Add(opened);
    ParseTag(opened, atom);

    // Hand the current style to the element, when one was supplied.
    if (atom != null)
    {
        atom.Style = Style;
    }

    // Self-closing tags (<br/>) are closed right away.
    if (chunk.bEndClosure)
    {
        CloseOneTag(chunk);
    }
}
/// <summary>
/// Appends the text of a chunk to the dictionary-result field selected by the
/// current state. All fields receive the text trimmed of spaces, tabs and line
/// breaks, except the pronunciation (state 4), which is HTML-decoded instead.
/// </summary>
/// <param name="oChunk">Text chunk whose oHTML is consumed.</param>
/// <param name="state">Current parser state; read only.</param>
private void HandleText(HTMLchunk oChunk, ref int state)
{
    char[] whitespace = { ' ', '\t', '\r', '\n' };

    switch (state)
    {
        case 2:
            dictResult.Word += oChunk.oHTML.Trim(whitespace);
            break;

        case 4:
            // Pronunciation is decoded, not trimmed.
            dictResult.Pronunciation += System.Web.HttpUtility.HtmlDecode(oChunk.oHTML);
            break;

        case 6:
            dictResult.ChineseExplanations += oChunk.oHTML.Trim(whitespace);
            break;

        case 9:
            dictResult.Examples += oChunk.oHTML.Trim(whitespace);
            break;

        case 12:
            dictResult.Variations += oChunk.oHTML.Trim(whitespace);
            break;

        case 14:
            dictResult.EnglishExplanations += oChunk.oHTML.Trim(whitespace);
            break;

        case 16:
            dictResult.FromEncyclopedia += oChunk.oHTML.Trim(whitespace);
            break;
    }
}
/// <summary>
/// Releases the objects held by this instance exactly once: the current
/// chunk, the text buffer, the entity helper and the tag parser are disposed
/// and cleared, and the HTML byte buffer reference is dropped.
/// </summary>
/// <param name="bDisposing">Not consulted by this method.</param>
private void Dispose(bool bDisposing)
{
    // Second and later calls are no-ops.
    if (bDisposed)
    {
        return;
    }

    bDisposed = true;

    if (oChunk != null)
    {
        oChunk.Dispose();
        oChunk = null;
    }

    if (sText != null)
    {
        sText.Dispose();
        sText = null;
    }

    bHTML = null;

    if (oE != null)
    {
        oE.Dispose();
        oE = null;
    }

    if (oTP != null)
    {
        oTP.Dispose();
        oTP = null;
    }
}
/// <summary>
/// Reads chunks from the parser until one bookmark item is complete.
/// An &lt;a&gt; tag starts a link, a following &lt;dd&gt; supplies its description,
/// an &lt;h3&gt; tag yields a folder immediately, and a &lt;dt&gt; open tag or any
/// dl chunk ends the item after stepping the parser back.
/// </summary>
/// <param name="parser">Parser positioned before the item to read.</param>
/// <returns>The folder or link that was read, or null when no link was found.</returns>
private IBookmarkItem ParseItem(HTMLparser parser)
{
    BookmarkLink link = null;
    HTMLchunk previous = parser.CurrentChunk;

    // The iterator keeps "previous" one chunk behind "current".
    for (HTMLchunk current = parser.ParseNext();
         current != null;
         previous = current, current = parser.ParseNext())
    {
        if (current.IsOpenTag && current.Tag == "a")
        {
            link = new BookmarkLink();
            AssignLinkAttributes(link, current.oParams);
            link.Title = GetTextOrDontMove(parser);
            continue;
        }

        if (current.IsOpenTag && current.Tag == "dd" && link != null)
        {
            link.Description = ParseDescription(parser);
            continue;
        }

        if (current.IsOpenTag && current.Tag == "h3")
        {
            BookmarkFolder folder = new BookmarkFolder();
            AssignFolderAttributes(folder, current.oParams);
            folder.Title = GetTextOrDontMove(parser);
            return folder;
        }

        if ((current.IsOpenTag && current.Tag == "dt") || current.Tag == "dl")
        {
            // This chunk is handed back to the parser before stopping.
            parser.StepBack(previous);
            break;
        }
    }

    return link;
}
/// <summary>
/// Looks for element attributes whose value is controlled by user input.
/// For every attribute of the element (event handlers excluded), an alert is
/// assembled for each user-supplied parameter value that the attribute value
/// starts with (case-insensitively).
/// </summary>
/// <param name="parms">User-controlled parameters (name/value pairs).</param>
/// <param name="chunk">Parsed HTML element whose attributes are inspected.</param>
private void CheckTags(NameValueCollection parms, HTMLchunk chunk)
{
    // Loop through all attributes of the current HTML element
    foreach (DictionaryEntry dictEntry in chunk.oParams)
    {
        string attributeName = dictEntry.Key.ToString();

        // Ignore all action events e.g. onmouseover, onclick, on*.
        // Only this attribute is skipped; the remaining attributes of the
        // element must still be examined.
        if (attributeName.StartsWith("on", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        // An attribute without a value cannot start with user data.
        if (dictEntry.Value == null)
        {
            continue;
        }

        string attributeValue = dictEntry.Value.ToString();

        // Loop through all values in the user-controlled parameters
        foreach (string param in parms)
        {
            // NameValueCollection.Get returns null for a key without values;
            // ToSafeLower is assumed to pass null through - TODO confirm.
            string paramValue = Utility.ToSafeLower(parms.Get(param));

            // Only look at user-controlled parameter values that are bigger than 1 character.
            // This is kinda lame but reduces false positives.
            if (paramValue == null || paramValue.Length <= 1)
            {
                continue;
            }

            if (attributeValue.StartsWith(paramValue, StringComparison.InvariantCultureIgnoreCase))
            {
                AssembleAlert(chunk.sTag, attributeName, param, paramValue, attributeValue);
            }
        }
    }
}
/// <summary>
/// Internally parses a tag and returns it, starting from the point where '&lt;' was found.
/// Opening script tags are handed to the script parser instead.
/// </summary>
/// <returns>The parsed chunk (also stored in oChunk).</returns>
HTMLchunk GetNextTag()
{
    oChunk = oTP.ParseTag(ref iCurPos);

    // For backwards compatibility, closed tags that carry params are marked as open.
    bool closedTagWithParams = oChunk.iParams > 0
        && bAutoMarkClosedTagsWithParamsAsOpen
        && oChunk.oType == HTMLchunkType.CloseTag;
    if (closedTagWithParams)
    {
        oChunk.oType = HTMLchunkType.OpenTag;
    }

    // Cheap length and first-character checks run before the full comparison.
    bool isScriptTag = oChunk.sTag.Length == 6
        && oChunk.sTag[0] == 's'
        && oChunk.sTag == "script";
    if (isScriptTag && !oChunk.bClosure)
    {
        // Start of a script: the script parser produces the chunk to return.
        oChunk.oType = HTMLchunkType.Script;
        oChunk = oTP.ParseScript(ref iCurPos);
        return oChunk;
    }

    oChunk.iChunkLength = iCurPos - oChunk.iChunkOffset;

    if (bKeepRawHTML)
    {
        oChunk.oHTML = oEnc.GetString(bHTML, oChunk.iChunkOffset, oChunk.iChunkLength);
    }

    return oChunk;
}
/// <summary>
/// Advances the parser state when a closing tag is seen.
/// States 2 and 4 advance on any closing tag; state 9 advances on "ol",
/// state 12 on "table"; states 6, 14 and 16 count closing "div" tags and
/// advance once divCount drops to zero.
/// </summary>
/// <param name="oChunk">The closing tag chunk.</param>
/// <param name="state">Current parser state; updated in place.</param>
private void HandleCloseTag(HTMLchunk oChunk, ref int state)
{
    switch (state)
    {
        case 2:
            state = 3;
            break;

        case 4:
            state = 5;
            break;

        case 9:
            if (oChunk.sTag == "ol")
            {
                state = 10;
            }
            break;

        case 12:
            if (oChunk.sTag == "table")
            {
                state = 13;
            }
            break;

        case 6:
        case 14:
        case 16:
            // divCount is only decremented for div tags (short-circuit).
            if (oChunk.sTag == "div" && --divCount == 0)
            {
                state += 1;
            }
            break;
    }
}
/// <summary>
/// Creates an open-tag record from a parsed chunk, copying the tag name,
/// both closure flags and every parameter entry into a new Hashtable.
/// </summary>
/// <param name="chunk">Chunk whose Tag, Closure, EndClosure and Params are copied.</param>
public OpenTag(HTMLchunk chunk)
{
    Tag = chunk.Tag;
    Closure = chunk.Closure;
    EndClosure = chunk.EndClosure;

    // A separate table is filled so the chunk's own Params are not shared.
    Hashtable copy = new Hashtable();
    Params = copy;

    foreach (DictionaryEntry pair in chunk.Params)
    {
        copy.Add(pair.Key, pair.Value);
    }
}
/// <summary>
/// Examines every attribute of a tag. "class" attributes with marker values
/// advance the state (and flush a finished item into the result list);
/// "href" attributes are stored as the item's URL, similar-pages URL or
/// cache URL depending on the state and the link text.
/// </summary>
/// <param name="oChunk">Tag chunk whose attributes are read.</param>
/// <param name="state">Current parser state; updated in place.</param>
private void HandleParam(HTMLchunk oChunk, ref int state)
{
    for (int idx = 0; idx < oChunk.iParams; idx++)
    {
        // Only a default label exists, so all attributes are treated alike.
        switch (oChunk.cParamChars[idx])
        {
            default:
                string name = oChunk.sParams[idx];
                string value = oChunk.sValues[idx];

                if (name == "class")
                {
                    switch (state)
                    {
                        case 2:
                            if (value == "g")
                            {
                                state = 3;

                                // A previous item with a URL is complete: store it and start a new one.
                                if (!string.IsNullOrEmpty(item.Url))
                                {
                                    searchResult.Results.Add(item);
                                    item = new SearchEngineResult.ResultItem();
                                }
                            }
                            break;

                        case 3:
                            if (value == "r")
                            {
                                state = 4;
                            }
                            break;

                        case 6:
                            if (value == "s")
                            {
                                state = 7;
                            }
                            break;

                        case 7:
                            if (value == "gl")
                            {
                                state = 8;
                            }
                            break;
                    }
                }
                else if (name == "href")
                {
                    if (state == 5)
                    {
                        item.Url = value;
                    }
                    else if (state == 9 || state == 11)
                    {
                        if (value.IndexOf("q=related") != -1)
                        {
                            item.SimilarUrl = value;
                        }
                        else if (value.IndexOf("q=cache") != -1)
                        {
                            item.CacheUrl = value;
                        }
                    }
                }
                break;
        }
    }
}
/// <summary>
/// Appends the raw HTML of a text chunk to the current item's title
/// (state 4) or description (state 5); other states are ignored.
/// </summary>
/// <param name="oChunk">Text chunk whose oHTML is appended.</param>
/// <param name="state">Current parser state; read only.</param>
private void HandleText(HTMLchunk oChunk, ref int state)
{
    switch (state)
    {
        case 4:
            item.Title += oChunk.oHTML;
            break;

        case 5:
            item.Description += oChunk.oHTML;
            break;
    }
}
/// <summary>
/// Creates an open-tag record from a parsed chunk, copying the tag name,
/// both closure flags and every parameter entry into a new Hashtable.
/// </summary>
/// <param name="chunk">Chunk whose sTag, bClosure, bEndClosure and oParams are copied.</param>
public OpenTag(HTMLchunk chunk)
{
    sTag = chunk.sTag;
    bClosure = chunk.bClosure;
    bEndClosure = chunk.bEndClosure;

    // A separate table is filled so the chunk's own oParams are not shared.
    Hashtable copy = new Hashtable();
    oParams = copy;

    foreach (DictionaryEntry pair in chunk.oParams)
    {
        copy.Add(pair.Key, pair.Value);
    }
}
/// <summary>
/// Examines every attribute of a tag and switches the parser state when an
/// "id" attribute with one of the known marker values is found. Sections
/// tracked by div nesting (exp_exp, exp_eee, exp_baike) also reset divCount to 1.
/// </summary>
/// <param name="oChunk">Tag chunk whose attributes are read.</param>
/// <param name="state">Current parser state; updated in place.</param>
private void HandleParam(HTMLchunk oChunk, ref int state)
{
    for (int idx = 0; idx < oChunk.iParams; idx++)
    {
        // Only a default label exists, so all attributes are treated alike.
        switch (oChunk.cParamChars[idx])
        {
            default:
                // Every marker is an id attribute; anything else is irrelevant.
                if (oChunk.sParams[idx] != "id")
                {
                    break;
                }

                switch (oChunk.sValues[idx])
                {
                    case "main_right_left":
                        if (state == 0)
                        {
                            state = 1;
                        }
                        break;

                    case "word":
                        if (state > 0)
                        {
                            state = 2;
                        }
                        break;

                    case "pron":
                        if (state == 3)
                        {
                            state = 4;
                        }
                        break;

                    case "exp_exp":
                        if (state > 2)
                        {
                            state = 6;
                            divCount = 1;
                        }
                        break;

                    case "exp_eg":
                        if (state > 2)
                        {
                            state = 8;
                        }
                        break;

                    case "exp_tran":
                        if (state > 2)
                        {
                            state = 11;
                        }
                        break;

                    case "exp_eee":
                        if (state > 2)
                        {
                            state = 14;
                            divCount = 1;
                        }
                        break;

                    case "exp_baike":
                        if (state > 2)
                        {
                            state = 16;
                            divCount = 1;
                        }
                        break;
                }
                break;
        }
    }
}
/// <summary>
/// Examines every attribute of a tag. class="f" in state 2 starts a new
/// result (flushing a finished item); "href" values are stored as URL,
/// cache URL or similar URL depending on the state; "id" values starting
/// with "sogou_snapshot" or "sogou_sis" select states 7 and 10.
/// </summary>
/// <param name="oChunk">Tag chunk whose attributes are read.</param>
/// <param name="state">Current parser state; updated in place.</param>
private void HandleParam(HTMLchunk oChunk, ref int state)
{
    for (int idx = 0; idx < oChunk.iParams; idx++)
    {
        // Only a default label exists, so all attributes are treated alike.
        switch (oChunk.cParamChars[idx])
        {
            default:
                string name = oChunk.sParams[idx];
                string value = oChunk.sValues[idx];

                if (name == "class")
                {
                    if (value == "f" && state == 2)
                    {
                        state = 3;

                        // A previous item with a URL is complete: store it and start a new one.
                        if (!string.IsNullOrEmpty(item.Url))
                        {
                            searchResult.Results.Add(item);
                            item = new SearchEngineResult.ResultItem();
                        }
                    }
                }
                else if (name == "href")
                {
                    switch (state)
                    {
                        case 4:
                            item.Url = value;
                            break;

                        case 7:
                            item.CacheUrl = value;
                            break;

                        case 10:
                            item.SimilarUrl = value;
                            break;
                    }
                }
                else if (name == "id" && (state == 6 || state == 9))
                {
                    if (value.StartsWith("sogou_snapshot"))
                    {
                        state = 7;
                    }
                    else if (value.StartsWith("sogou_sis"))
                    {
                        state = 10;
                    }
                }
                break;
        }
    }
}
/// <summary>
/// Updates the parser state on a closing tag: "ol" resets the state to 0,
/// and "a" moves states 5, 9 and 11 on to the next state.
/// </summary>
/// <param name="oChunk">The closing tag chunk.</param>
/// <param name="state">Current parser state; updated in place.</param>
private void HandleCloseTag(HTMLchunk oChunk, ref int state)
{
    switch (oChunk.sTag)
    {
        case "ol":
            state = 0;
            break;

        case "a":
            if (state == 5 || state == 9 || state == 11)
            {
                state += 1;
            }
            break;
    }
}
/// <summary>
/// Updates the parser state on an opening tag: "ol" moves state 8 to 9,
/// "table" moves 11 to 12, and an opening "div" in states 6, 14 or 16
/// increases the div nesting counter.
/// </summary>
/// <param name="oChunk">The opening tag chunk.</param>
/// <param name="state">Current parser state; updated in place.</param>
private void HandleOpenTag(HTMLchunk oChunk, ref int state)
{
    switch (state)
    {
        case 8:
            if (oChunk.sTag == "ol")
            {
                state = 9;
            }
            break;

        case 11:
            if (oChunk.sTag == "table")
            {
                state = 12;
            }
            break;

        case 6:
        case 14:
        case 16:
            if (oChunk.sTag == "div")
            {
                ++divCount;
            }
            break;
    }
}
/// <summary>
/// Check the EMBED tag for its enableHtmlAccess value.
/// Silverlight can be embedded using the EMBED element tag.
/// See http://www.informit.com/articles/article.aspx?p=1078181
/// </summary>
/// <param name="chunk">Parsed EMBED element; its attribute table (oParams) and raw HTML (oHTML) are read.</param>
private void CheckEmbedTag(HTMLchunk chunk)
{
    if (!chunk.oParams.ContainsKey("enablehtmlaccess"))
    {
        return;
    }

    // The attribute may be present without a usable value; skip it instead of
    // failing on ToString().
    object rawValue = chunk.oParams["enablehtmlaccess"];
    if (rawValue == null)
    {
        return;
    }

    CheckEnableHtmlAccessValue(rawValue.ToString(), chunk.oHTML);
}
/// <summary>
/// Parses a result page and collects the items found into a new
/// SearchEngineResult whose type is set to Baidu.
/// </summary>
/// <param name="html">Page markup to parse.</param>
/// <param name="encoding">Encoding used to turn the markup into bytes and handed to the parser.</param>
/// <returns>The populated search result (also kept in the searchResult field).</returns>
public SearchEngineResult Parse(string html, Encoding encoding)
{
    HTMLparser oP = HtmlParserFactory.GetInstance();

    searchResult = new SearchEngineResult();
    searchResult.SearchEngineType = SearchEngineType.Baidu;
    item = new SearchEngineResult.ResultItem();

    oP.Init(encoding.GetBytes(html));
    oP.SetEncoding(encoding);

    int state = 0;
    bool bEncodingSet = false;

    for (HTMLchunk oChunk = oP.ParseNext(); oChunk != null; oChunk = oP.ParseNext())
    {
        HTMLchunkType chunkType = oChunk.oType;

        if (chunkType == HTMLchunkType.OpenTag)
        {
            HandleOpenTag(oChunk, ref state);

            // META tags may announce the page encoding.
            if (oChunk.sTag == "meta")
            {
                HandleMetaEncoding(oP, oChunk, ref bEncodingSet);
            }

            HandleParam(oChunk, ref state);
        }
        else if (chunkType == HTMLchunkType.CloseTag)
        {
            HandleCloseTag(oChunk, ref state);
        }
        else if (chunkType == HTMLchunkType.Text)
        {
            HandleText(oChunk, ref state);
        }
    }

    return searchResult;
}
/// <summary>
/// Parses a dictionary page and collects the recognised sections into a
/// new DictResult whose dictionary type is set to Dict_cn.
/// </summary>
/// <param name="html">Page markup to parse.</param>
/// <param name="encoding">Encoding used to turn the markup into bytes and handed to the parser.</param>
/// <returns>The populated dictionary result (also kept in the dictResult field).</returns>
public SmartMe.Core.Data.DictResult Parse(string html, Encoding encoding)
{
    dictResult = new DictResult();
    HTMLparser oP = HtmlParserFactory.GetInstance();
    dictResult.DictionaryType = DictionaryType.Dict_cn;

    oP.Init(encoding.GetBytes(html));
    oP.SetEncoding(encoding);

    int state = 0;
    bool bEncodingSet = false;

    for (HTMLchunk oChunk = oP.ParseNext(); oChunk != null; oChunk = oP.ParseNext())
    {
        HTMLchunkType chunkType = oChunk.oType;

        if (chunkType == HTMLchunkType.OpenTag)
        {
            HandleOpenTag(oChunk, ref state);

            // META tags may announce the page encoding.
            if (oChunk.sTag == "meta")
            {
                HandleMetaEncoding(oP, oChunk, ref bEncodingSet);
            }

            HandleParam(oChunk, ref state);
        }
        else if (chunkType == HTMLchunkType.CloseTag)
        {
            HandleCloseTag(oChunk, ref state);
        }
        else if (chunkType == HTMLchunkType.Text)
        {
            HandleText(oChunk, ref state);
        }
    }

    return dictResult;
}
/// <summary>
/// Removes the most recently opened tag whose name matches the chunk's tag
/// and recalculates the style; does nothing when no such tag is open.
/// </summary>
/// <param name="chunk">Closing chunk whose Tag selects the open tag to remove.</param>
public void CloseOneTag(HTMLchunk chunk)
{
    // Search from the newest entry backwards so the innermost match is closed.
    var index = _openTags.Count;
    while (--index >= 0)
    {
        if (_openTags[index].Tag != chunk.Tag)
        {
            continue;
        }

        _openTags.RemoveAt(index);
        RecalculateStyle();
        return;
    }
}
/// <summary>
/// Handles an anchor chunk. The style is recalculated in every case; an
/// opening anchor is additionally registered as an open tag and parsed.
/// Self-closing anchors get no special treatment (solo anchor elements are
/// meaningless).
/// </summary>
/// <param name="chunk">The anchor chunk.</param>
/// <param name="atom">Element passed on to ParseTag for an opening anchor.</param>
public void InterpretHREF(HTMLchunk chunk, AElement atom)
{
    // Opening and closing a hyperlink both begin with a style recalculation.
    RecalculateStyle();

    if (chunk.Closure)
    {
        // Closing a hyperlink: nothing else to do.
        return;
    }

    // Opening a hyperlink.
    var opened = new OpenTag(chunk);
    _openTags.Add(opened);
    ParseTag(opened, atom);
}
/// <summary>
/// Lets the parser apply an encoding announced by a META tag, unless an
/// encoding has already been set. The first encoding is assumed to be the
/// correct one, which is the logic that major browsers follow. A failed
/// attempt is reported on the console.
/// </summary>
/// <param name="oP">Parser whose encoding may be changed.</param>
/// <param name="oChunk">The META tag chunk.</param>
/// <param name="bEncodingSet">True once an encoding has been applied; updated by the parser.</param>
private void HandleMetaEncoding(HTMLparser oP, HTMLchunk oChunk, ref bool bEncodingSet)
{
    if (bEncodingSet)
    {
        return;
    }

    bool wasEncodingMeta = HTMLparser.HandleMetaEncoding(oP, oChunk, ref bEncodingSet);

    // The tag declared an encoding but it could not be applied.
    if (wasEncodingMeta && !bEncodingSet)
    {
        Console.WriteLine("Failed to set encoding from META: {0}", oChunk.GenerateHTML());
    }
}
/// <summary>
/// Updates the parser state on a closing tag: "tr" resets the state to 0,
/// "td" sets it to 1, and "a" moves states 4, 6 and 8 on to the next state.
/// </summary>
/// <param name="oChunk">The closing tag chunk.</param>
/// <param name="state">Current parser state; updated in place.</param>
private void HandleCloseTag(HTMLchunk oChunk, ref int state)
{
    switch (oChunk.sTag)
    {
        case "tr":
            state = 0;
            break;

        case "td":
            state = 1;
            break;

        case "a":
            if (state == 4 || state == 6 || state == 8)
            {
                state += 1;
            }
            break;
    }
}
/// <summary>
/// Removes the most recently opened tag whose name matches the chunk's tag
/// and recalculates the style; does nothing when no such tag is open.
/// </summary>
/// <param name="chunk">Closing chunk whose sTag selects the open tag to remove.</param>
public void CloseOneTag(HTMLchunk chunk)
{
    // Search from the newest entry backwards so the innermost match is closed.
    int index = m_OpenTags.Count;
    while (--index >= 0)
    {
        if (m_OpenTags[index].sTag != chunk.sTag)
        {
            continue;
        }

        m_OpenTags.RemoveAt(index);
        RecalculateStyle();
        return;
    }
}
/// <summary>
/// Advances the parser state on an opening tag: "div" moves 0 to 1,
/// "ul" moves 2 to 3, "li" moves 4 to 5 and "a" moves 5 to 6.
/// At most one transition happens per call.
/// </summary>
/// <param name="oChunk">The opening tag chunk.</param>
/// <param name="state">Current parser state; updated in place.</param>
private void HandleOpenTag(HTMLchunk oChunk, ref int state)
{
    switch (state)
    {
        case 0:
            if (oChunk.sTag == "div")
            {
                state = 1;
            }
            break;

        case 2:
            if (oChunk.sTag == "ul")
            {
                state = 3;
            }
            break;

        case 4:
            if (oChunk.sTag == "li")
            {
                state = 5;
            }
            break;

        case 5:
            if (oChunk.sTag == "a")
            {
                state = 6;
            }
            break;
    }
}
/// <summary>
/// Examines every attribute of a tag. class="f" in state 2 starts a new
/// result (flushing a finished item); "href" values are stored as the
/// item's URL in state 4, or as cache / similar URL in states 6 and 8
/// depending on the link text.
/// </summary>
/// <param name="oChunk">Tag chunk whose attributes are read.</param>
/// <param name="state">Current parser state; updated in place.</param>
private void HandleParam(HTMLchunk oChunk, ref int state)
{
    for (int idx = 0; idx < oChunk.iParams; idx++)
    {
        // Only a default label exists, so all attributes are treated alike.
        switch (oChunk.cParamChars[idx])
        {
            default:
                string name = oChunk.sParams[idx];
                string value = oChunk.sValues[idx];

                if (name == "class")
                {
                    if (value == "f" && state == 2)
                    {
                        state = 3;

                        // A previous item with a URL is complete: store it and start a new one.
                        if (!string.IsNullOrEmpty(item.Url))
                        {
                            searchResult.Results.Add(item);
                            item = new SearchEngineResult.ResultItem();
                        }
                    }
                }
                else if (name == "href")
                {
                    if (state == 4)
                    {
                        item.Url = value;
                    }
                    else if (state == 6 || state == 8)
                    {
                        if (value.IndexOf("cache") != -1)
                        {
                            item.CacheUrl = value;
                        }
                        else if (value.StartsWith("s?cl=2"))
                        {
                            item.SimilarUrl = value;
                        }
                    }
                }
                break;
        }
    }
}
/// <summary>
/// Checks an OBJECT element for its allowScriptAccess setting. Two
/// independent paths are evaluated: one keyed on the element's classid and
/// one keyed on its type attribute.
/// </summary>
/// <param name="chunk">Parsed OBJECT element; its attribute table (oParams) and raw HTML (oHTML) are read.</param>
/// <param name="parser">Parser handed to GetAllowScriptAccessValue to look through the param elements.</param>
private void CheckObjectTag(HTMLchunk chunk, ref UtilityHtmlParser parser)
{
    String html = null;
    String allowScriptAccessValue = null;
    bool flag = false;
    string b = chunk.oHTML;

    // Check the param elements of an object element
    if (chunk.oParams.ContainsKey("classid"))
    {
        String attr = chunk.oParams["classid"].ToString();

        // The CLSID is matched without regard to case, in line with the
        // lower-cased comparison used for the type attribute below.
        bool isFlashClassId =
            string.Equals(attr, "clsid:d27cdb6e-ae6d-11cf-96b8-444553540000", StringComparison.OrdinalIgnoreCase)
            || string.Equals(attr, "x-shockwave-flash", StringComparison.OrdinalIgnoreCase);

        if (isFlashClassId)
        {
            allowScriptAccessValue = GetAllowScriptAccessValue(ref parser, ref flag, allowScriptAccessValue, ref html);
            if (flag)
            {
                CheckAllowScriptAccessValue(allowScriptAccessValue, b);
            }
        }
    }

    // Otherwise check the attributes of the object element
    if (chunk.oParams.ContainsKey("type"))
    {
        string type = chunk.oParams["type"].ToString();

        if (Utility.ToSafeLower(type) == "application/x-shockwave-flash")
        {
            if (chunk.oParams.ContainsKey("allowscriptaccess"))
            {
                allowScriptAccessValue = chunk.oParams["allowscriptaccess"].ToString();
                CheckAllowScriptAccessValue(allowScriptAccessValue, chunk.oHTML);
            }
            else
            {
                // Start looking through the param elements.
                // NOTE(review): unlike the classid path, the result is checked
                // without consulting flag, and html is passed instead of the
                // element's own markup - confirm this is intended.
                allowScriptAccessValue = GetAllowScriptAccessValue(ref parser, ref flag, allowScriptAccessValue, ref html);
                CheckAllowScriptAccessValue(allowScriptAccessValue, html);
            }
        }
    }
}
/// <summary>
/// Advances the parser state on a closing tag: "a" moves 6 to 7, "li" moves
/// 7 back to 4 and stores a finished item, and "ul" in state 4 ends
/// parsing by setting the state to -1.
/// </summary>
/// <param name="oChunk">The closing tag chunk.</param>
/// <param name="state">Current parser state; updated in place.</param>
private void HandleCloseTag(HTMLchunk oChunk, ref int state)
{
    switch (state)
    {
        case 6:
            if (oChunk.sTag == "a")
            {
                state = 7;
            }
            break;

        case 7:
            if (oChunk.sTag == "li")
            {
                state = 4;

                // Only items that received a URL are kept.
                if (!string.IsNullOrEmpty(item.Url))
                {
                    searchResult.Results.Add(item);
                    item = new SearchEngineResult.ResultItem();
                }
            }
            break;

        case 4:
            if (oChunk.sTag == "ul")
            {
                state = -1;
            }
            break;
    }
}
/// <summary>
/// Sets the oHTML variable of a chunk to the raw HTML that was parsed for that chunk.
/// </summary>
/// <param name="oChunk">Chunk returned by the ParseNext function; it must belong to the same
/// HTMLparser, initiated with the same HTML data, that this chunk was parsed from.</param>
public void SetRawHTML(HTMLchunk oChunk)
{
    // Note: a byte array would represent the parsed data more faithfully than
    // a decoded string, but the chunk stores its raw HTML as a string.
    int offset = oChunk.iChunkOffset;
    int length = oChunk.iChunkLength;

    oChunk.oHTML = oEnc.GetString(bHTML, offset, length);
}
/// <summary>
/// Releases the objects held by this instance exactly once: the current
/// chunk, the text buffer, the entity helper and the tag parser are disposed
/// and cleared, and the HTML byte buffer reference is dropped.
/// </summary>
/// <param name="bDisposing">Not consulted by this method.</param>
private void Dispose(bool bDisposing)
{
    // Second and later calls are no-ops.
    if (bDisposed)
    {
        return;
    }

    bDisposed = true;

    if (oChunk != null)
    {
        oChunk.Dispose();
        oChunk = null;
    }

    if (sText != null)
    {
        sText.Dispose();
        sText = null;
    }

    bHTML = null;

    if (oE != null)
    {
        oE.Dispose();
        oE = null;
    }

    if (oTP != null)
    {
        oTP.Dispose();
        oTP = null;
    }
}
/// <summary>
/// Handles META tags that set page encoding
/// </summary>
/// <param name="oP">HTML parser object that is used for parsing</param>
/// <param name="oChunk">Parsed chunk that should contain tag META</param>
/// <param name="bEncodingSet">Your own flag that shows whether encoding was already set or not, if set
/// once then it should not be changed - this is the logic applied by major browsers</param>
/// <returns>True if this was META tag setting Encoding, false otherwise</returns>
public static bool HandleMetaEncoding(HTMLparser oP, HTMLchunk oChunk, ref bool bEncodingSet)
{
    // Cheap length and first-character checks run before the full comparison.
    if (oChunk.sTag.Length != 4 || oChunk.sTag[0] != 'm' || oChunk.sTag != "meta")
    {
        return false;
    }

    // if we do not use hashmode already then we call conversion explicitly
    // this is slow, but METAs are very rare so performance penalty is low
    if (!oChunk.bHashMode)
    {
        oChunk.ConvertParamsToHash();
    }

    string sKey = oChunk.oParams["http-equiv"] as string;
    if (sKey == null)
    {
        return false;
    }

    // Compared without creating a lower-cased copy and independent of the
    // current culture. "content-category" is a rare case (appears to work in
    // IE) reported to exist in some pages by Martin Bächtold.
    bool declaresContentType =
        string.Equals(sKey, "content-type", System.StringComparison.OrdinalIgnoreCase)
        || string.Equals(sKey, "content-category", System.StringComparison.OrdinalIgnoreCase);
    if (!declaresContentType)
    {
        return false;
    }

    // We might have a charset here that hints at the necessity to decode the page.
    // Once encoding is set it should not be changed, even though there are web
    // pages out there that try to do that.
    if (!bEncodingSet)
    {
        // it is possible we have broken META tag without Content part
        string sData = oChunk.oParams["content"] as string;

        if (sData != null && oP.SetEncoding(sData))
        {
            // Callers need to re-encode any text found so far at this point;
            // most likely it will be just TITLE, the rest can be ignored anyway.
            bEncodingSet = true;
        }

        // When SetEncoding fails, the encoding string was most likely incorrect
        // or the machine lacks the code page; the flag then stays false so the
        // caller can report it.
    }

    return true;
}
/// <summary>
/// Internally parses a tag and returns it, starting from the point where '&lt;' was found.
/// Opening script tags are handed to the script parser instead.
/// </summary>
/// <returns>The parsed chunk (also stored in oChunk).</returns>
HTMLchunk GetNextTag()
{
    oChunk = oTP.ParseTag(ref iCurPos);

    // For backwards compatibility, closed tags that carry params are marked as open.
    bool closedTagWithParams = oChunk.iParams > 0
        && bAutoMarkClosedTagsWithParamsAsOpen
        && oChunk.oType == HTMLchunkType.CloseTag;
    if (closedTagWithParams)
    {
        oChunk.oType = HTMLchunkType.OpenTag;
    }

    // Cheap length and first-character checks run before the full comparison.
    bool isScriptTag = oChunk.sTag.Length == 6
        && oChunk.sTag[0] == 's'
        && oChunk.sTag == "script";
    if (isScriptTag && !oChunk.bClosure)
    {
        // Start of a script: the script parser produces the chunk to return.
        oChunk.oType = HTMLchunkType.Script;
        oChunk = oTP.ParseScript(ref iCurPos);
        return oChunk;
    }

    oChunk.iChunkLength = iCurPos - oChunk.iChunkOffset;

    if (bKeepRawHTML)
    {
        oChunk.oHTML = oEnc.GetString(bHTML, oChunk.iChunkOffset, oChunk.iChunkLength);
    }

    return oChunk;
}