/// <summary>
/// Benchmark-only parse loop: drains every chunk from the parser and throws it away,
/// so nothing except the HTML parsing object itself is being measured.
/// </summary>
/// <param name="oP">Parser object</param>
void BenchMarkParse(HTMLparser oP)
{
    // The parser hands back the same HTMLchunk instance on every call (it re-uses it),
    // so the chunk must not be destroyed before parsing has finished.
    // A null chunk marks the end of the input.
    for (HTMLchunk oChunk = oP.ParseNext(); oChunk != null; oChunk = oP.ParseNext())
    {
        switch (oChunk.oType)
        {
            case HTMLchunkType.OpenTag:     // open tag, e.g. <a href="">
            case HTMLchunkType.CloseTag:    // close tag, e.g. </a>
            case HTMLchunkType.Text:        // plain text
            case HTMLchunkType.Comment:     // everything between <!-- and -->
                // deliberately nothing to do: only the parser is under test
                break;
        }
    }
}
/// <summary>
/// Decides which element a newly opened tag should be attached to.
/// For a few tags (form, td, tr, thead, tbody, tfoot) opening a new one implies that a
/// still-open element of the same name was closed; in that case the parent of that
/// earlier element is returned. Otherwise <paramref name="nextPotentialParent"/> is returned.
/// </summary>
/// <param name="m12chunk">Chunk describing the tag that is being opened.</param>
/// <param name="originalHtml">Original HTML text, passed through to CleanupTagName.</param>
/// <param name="nextPotentialParent">Element that would be the parent if nothing is implicitly closed.</param>
/// <returns>The element the new node should be added to.</returns>
static XElement FindParentOfNewNode(Majestic12.HTMLchunk m12chunk, string originalHtml, XElement nextPotentialParent)
{
    string newTag = CleanupTagName(m12chunk.sTag, originalHtml);

    // Element names that stop the upward search for an implicitly closed sibling.
    string[] searchBoundaries;
    switch (newTag)
    {
        case "form":
            // a form is looked for all the way up to the root
            searchBoundaries = new string[0];
            break;

        case "td":
            searchBoundaries = new[] { "tr" };
            break;

        case "tr":
            searchBoundaries = new[] { "table", "thead", "tbody", "tfoot" };
            break;

        case "thead":
        case "tbody":
        case "tfoot":
            searchBoundaries = new[] { "table" };
            break;

        default:
            // this tag never implies that an earlier element was closed
            return nextPotentialParent;
    }

    // Walk from the proposed parent towards the root.
    for (XElement candidate = nextPotentialParent; candidate != null; candidate = candidate.Parent)
    {
        XName candidateName = candidate.Name;

        if (searchBoundaries.Any(boundary => boundary == candidateName))
        {
            break;
        }

        if (newTag == candidateName)
        {
            // The matching element is considered closed; attach next to it.
            // A match on the root has no parent, so fall back to the proposed parent.
            return candidate.Parent ?? nextPotentialParent;
        }
    }

    return nextPotentialParent;
}
/// <summary>
/// Parses the tag that begins at the current position (the point where '&lt;' was found)
/// and returns the resulting chunk.
/// </summary>
/// <returns>Chunk</returns>
HTMLchunk GetNextTag()
{
    oChunk = oTP.ParseTag(ref iCurPos);

    // Backwards compatibility: a closed tag that carries params is reported as an open tag.
    bool bCloseTagWithParams = oChunk.iParams > 0
        && bAutoMarkClosedTagsWithParamsAsOpen
        && oChunk.oType == HTMLchunkType.CloseTag;
    if (bCloseTagWithParams)
    {
        oChunk.oType = HTMLchunkType.OpenTag;
    }

    // Start of a script block: cheap length / first-char checks run before the full compare.
    string sTagName = oChunk.sTag;
    bool bScriptStart = sTagName.Length == 6
        && sTagName[0] == 's'
        && sTagName == "script"
        && !oChunk.bClosure;
    if (bScriptStart)
    {
        oChunk.oType = HTMLchunkType.Script;
        oChunk = oTP.ParseScript(ref iCurPos);

        return oChunk;
    }

    oChunk.iChunkLength = iCurPos - oChunk.iChunkOffset;

    if (bKeepRawHTML)
    {
        // raw HTML of the tag is decoded straight from the source bytes
        oChunk.oHTML = oEnc.GetString(bHTML, oChunk.iChunkOffset, oChunk.iChunkLength);
    }

    return oChunk;
}
/// <summary>
/// Advances the scraper state machine when a closing tag is seen.
/// </summary>
/// <param name="oChunk">Chunk holding the closing tag.</param>
/// <param name="state">Current state; updated in place.</param>
private void HandleCloseTag(HTMLchunk oChunk, ref int state)
{
    // States 2 and 4 move on with any closing tag (2 -> 3, 4 -> 5).
    if (state == 2 || state == 4)
    {
        state += 1;
        return;
    }

    string tag = oChunk.sTag;

    switch (state)
    {
        case 9:
            if (tag == "ol")
            {
                state = 10;
            }
            break;

        case 12:
            if (tag == "table")
            {
                state = 13;
            }
            break;

        case 6:
        case 14:
        case 16:
            // divCount tracks nested divs; the state only advances once the
            // counter drops back to zero.
            if (tag == "div" && --divCount == 0)
            {
                state += 1;
            }
            break;
    }
}
/// <summary>
/// Disposes the owned helper objects and drops all references. Only the first call does
/// any work; later calls return immediately.
/// </summary>
/// <param name="bDisposing">Not consulted by this implementation.</param>
private void Dispose(bool bDisposing)
{
    if (bDisposed)
    {
        return;
    }

    // flag first so the work below runs only once
    bDisposed = true;

    if (oChunk != null)
    {
        oChunk.Dispose();
        oChunk = null;
    }

    if (sText != null)
    {
        sText.Dispose();
        sText = null;
    }

    // the byte buffer is only released, there is nothing to dispose
    bHTML = null;

    if (oE != null)
    {
        oE.Dispose();
        oE = null;
    }

    if (oTP != null)
    {
        oTP.Dispose();
        oTP = null;
    }
}
/// <summary>
/// Advances the scraper state machine when a closing tag is seen.
/// </summary>
/// <param name="oChunk">Chunk holding the closing tag.</param>
/// <param name="state">Current state; updated in place.</param>
private void HandleCloseTag(HTMLchunk oChunk, ref int state)
{
    switch (oChunk.sTag)
    {
        case "ol":
            // closing the list resets the machine
            state = 0;
            break;

        case "a":
            switch (state)
            {
                case 5:
                case 9:
                case 11:
                    // closing the link moves on to the following state
                    state += 1;
                    break;
            }
            break;
    }
}
/// <summary>
/// Drops every reference held by this object. Only the first call does any work;
/// the referenced objects themselves are not disposed here.
/// </summary>
/// <param name="bDisposing">Not consulted by this implementation.</param>
private void Dispose(bool bDisposing)
{
    if (bDisposed)
    {
        return;
    }

    bDisposed = true;

    // release references only
    bHTML = null;
    oChunk = null;
    sText = null;
    oE = null;
    oP = null;
}
/// <summary>
/// Initialises the tag parser by storing the objects and buffer it will work with.
/// </summary>
/// <param name="p_oP">Owning HTML parser.</param>
/// <param name="p_oChunk">Chunk object to use for parsed tags.</param>
/// <param name="p_sText">Scratch text buffer.</param>
/// <param name="p_bHTML">HTML bytes to parse.</param>
/// <param name="p_iDataLength">Length of the data to parse.</param>
/// <param name="p_oE">Entities helper.</param>
/// <param name="p_oHE">Heuristics helper.</param>
internal void Init(HTMLparser p_oP, HTMLchunk p_oChunk, DynaString p_sText, byte[] p_bHTML, int p_iDataLength, HTMLentities p_oE, HTMLheuristics p_oHE)
{
    // collaborating objects
    oP = p_oP;
    oChunk = p_oChunk;
    sText = p_sText;

    // input data
    bHTML = p_bHTML;
    iDataLength = p_iDataLength;

    // heuristics must stop short of the end of the data, so keep a safety margin
    iMaxHeuDataLength = iDataLength - MIN_DATA_SIZE_FOR_HEURISTICS;

    oE = p_oE;
    oHE = p_oHE;
}
/// <summary>
/// Lets the parser pick up the page encoding from a META tag, unless one was already set.
/// </summary>
/// <param name="oP">Parser object.</param>
/// <param name="oChunk">Chunk to inspect.</param>
/// <param name="bEncodingSet">True once an encoding has been set; updated by the parser call.</param>
private void HandleMetaEncoding(HTMLparser oP, HTMLchunk oChunk, ref bool bEncodingSet)
{
    // Like the major browsers, the first encoding seen is taken as the correct one,
    // so nothing is attempted once an encoding is already in place.
    if (!bEncodingSet)
    {
        bool bHandled = HTMLparser.HandleMetaEncoding(oP, oChunk, ref bEncodingSet);

        // handled, but the encoding still is not set: report it
        if (bHandled && !bEncodingSet)
        {
            Console.WriteLine("Failed to set encoding from META: {0}", oChunk.GenerateHTML());
        }
    }
}
/// <summary>
/// Advances the scraper state machine when a closing tag is seen.
/// </summary>
/// <param name="oChunk">Chunk holding the closing tag.</param>
/// <param name="state">Current state; updated in place.</param>
private void HandleCloseTag(HTMLchunk oChunk, ref int state)
{
    switch (oChunk.sTag)
    {
        case "tr":
            // end of row: start over
            state = 0;
            break;

        case "td":
            // end of cell: back to the "inside a row" state
            state = 1;
            break;

        case "a":
            switch (state)
            {
                case 4:
                case 6:
                case 8:
                    // closing the link moves on to the following state
                    state += 1;
                    break;
            }
            break;
    }
}
/// <summary>
/// Handles META tags that set the page encoding.
/// </summary>
/// <param name="oP">Parser object.</param>
/// <param name="oChunk">Chunk</param>
/// <param name="bEncodingSet">True once an encoding has been set; updated by the parser call.</param>
void HandleMetaEncoding(HTMLparser oP, HTMLchunk oChunk, ref bool bEncodingSet)
{
    // The first encoding found wins (this mirrors what the major browsers do),
    // so a new one is only looked for while none has been set yet.
    bool bNeedsEncoding = !bEncodingSet;

    if (bNeedsEncoding && HTMLparser.HandleMetaEncoding(oP, oChunk, ref bEncodingSet))
    {
        if (bEncodingSet)
        {
            return;
        }

        Console.WriteLine("Failed to set encoding from META: {0}", oChunk.GenerateHTML());
    }
}
/// <summary>
/// Advances the scraper state machine when a closing tag is seen and stores the
/// result item that was being collected once its list entry ends.
/// </summary>
/// <param name="oChunk">Chunk holding the closing tag.</param>
/// <param name="state">Current state; updated in place.</param>
private void HandleCloseTag(HTMLchunk oChunk, ref int state)
{
    string tag = oChunk.sTag;

    switch (state)
    {
        case 6:
            if (tag == "a")
            {
                state += 1;
            }
            break;

        case 7:
            if (tag == "li")
            {
                state = 4;

                // keep the collected item only when it actually has a URL
                bool hasUrl = item.Url != null && item.Url != "";
                if (hasUrl)
                {
                    searchResult.Results.Add(item);
                    item = new SearchEngineResult.ResultItem();
                }
            }
            break;

        case 4:
            if (tag == "ul")
            {
                state = -1;
            }
            break;
    }
}
/// <summary>
/// Splits an HTML string into chunks and returns independent copies of every chunk
/// whose raw HTML is not blank.
/// </summary>
/// <param name="str">HTML text to parse.</param>
/// <returns>
/// One <c>HTMLchunk</c> per non-blank chunk, in document order, with only
/// <c>oHTML</c>, <c>oType</c> and <c>sTag</c> filled in.
/// </returns>
private HTMLchunk[] htmlParse(string str)
{
    // Fully qualified so no extra using directive is needed for the generic list.
    System.Collections.Generic.List<HTMLchunk> ret = new System.Collections.Generic.List<HTMLchunk>();

    Majestic12.HTMLparser parser = new Majestic12.HTMLparser();

    // keep raw html because we need to reconstruct it
    parser.bKeepRawHTML = true;
    parser.bTextMode = true;

    try
    {
        parser.Init(str);

        // The parser returns null once the end of the input is reached.
        Majestic12.HTMLchunk chunk;
        while ((chunk = parser.ParseNext()) != null)
        {
            // discard empty blocks for performance increase
            if (chunk.oHTML == null || chunk.oHTML.Trim().Length == 0)
            {
                continue;
            }

            // The parser re-uses its chunk object, so the values are copied into a
            // fresh chunk. Strings are immutable, so sharing the references is a
            // safe copy; String.Copy is obsolete and throws on null.
            HTMLchunk clone = new HTMLchunk(false);
            clone.oHTML = chunk.oHTML;
            clone.oType = chunk.oType;
            clone.sTag = chunk.sTag;
            ret.Add(clone);
        }
    }
    finally
    {
        // always reset the parser, even when parsing throws
        parser.CleanUp();
    }

    return ret.ToArray();
}
/// <summary>
/// Inspects the attributes of a tag and advances the scraper state machine, filling in
/// the URLs of the result item that is currently being collected.
/// </summary>
/// <param name="oChunk">Chunk whose attributes (sParams / sValues) are examined.</param>
/// <param name="state">Current state; updated in place.</param>
private void HandleParam(HTMLchunk oChunk, ref int state)
{
    // The loop bound already covers the "no attributes" case.
    for (int i = 0; i < oChunk.iParams; i++)
    {
        string name = oChunk.sParams[i];
        string value = oChunk.sValues[i];

        if (value == "f" && name == "class" && state == 2)
        {
            // A new result block starts: flush the previous item if it has a URL.
            state = 3;
            if (!string.IsNullOrEmpty(item.Url))
            {
                searchResult.Results.Add(item);
                item = new SearchEngineResult.ResultItem();
            }
        }
        else if (name == "href")
        {
            if (state == 4)
            {
                item.Url = value;
            }
            else if (state == 7)
            {
                item.CacheUrl = value;
            }
            else if (state == 10)
            {
                item.SimilarUrl = value;
            }
        }
        else if (name == "id" && (state == 6 || state == 9) && value != null)
        {
            // Element ids are fixed ASCII tokens, so they are compared ordinally
            // rather than with the current culture.
            if (value.StartsWith("sogou_snapshot", System.StringComparison.Ordinal))
            {
                state = 7;
            }
            else if (value.StartsWith("sogou_sis", System.StringComparison.Ordinal))
            {
                state = 10;
            }
        }
    }
}
/// <summary>
/// Initialises the tag parser by storing the objects and buffer it will work with.
/// </summary>
/// <param name="p_oP">Owning HTML parser.</param>
/// <param name="p_oChunk">Chunk object to use for parsed tags.</param>
/// <param name="p_sText">Scratch text buffer.</param>
/// <param name="p_bHTML">HTML bytes to parse.</param>
/// <param name="p_iDataLength">Length of the data to parse.</param>
/// <param name="p_oE">Entities helper.</param>
/// <param name="p_oHE">Heuristics helper.</param>
internal void Init(HTMLparser p_oP, HTMLchunk p_oChunk, DynaString p_sText, byte[] p_bHTML, int p_iDataLength, HTMLentities p_oE, HTMLheuristics p_oHE)
{
    // collaborating objects
    oP = p_oP;
    oChunk = p_oChunk;
    sText = p_sText;

    // input data
    bHTML = p_bHTML;
    iDataLength = p_iDataLength;

    // heuristics must stop short of the end of the data, so keep a safety margin
    iMaxHeuDataLength = iDataLength - MIN_DATA_SIZE_FOR_HEURISTICS;

    oE = p_oE;
    oHE = p_oHE;
}
/// <summary>
/// Moves the parser back to the given chunk: the current position becomes the chunk's
/// content position and the chunk becomes the current chunk. A null chunk is ignored.
/// </summary>
/// <param name="chunk">Chunk to step back to; may be null.</param>
public void StepBack(HTMLchunk chunk)
{
    if (chunk != null)
    {
        CurPos = chunk.ContentPosition;
        CurrentChunk = chunk;
    }
}
/// <summary>
/// Inspects the attributes of a tag and advances the scraper state machine according to
/// the <c>id</c> attribute of the page section that was just entered.
/// </summary>
/// <param name="oChunk">Chunk whose attributes (sParams / sValues) are examined.</param>
/// <param name="state">Current state; updated in place.</param>
private void HandleParam(HTMLchunk oChunk, ref int state)
{
    // The loop bound already covers the "no attributes" case.
    for (int i = 0; i < oChunk.iParams; i++)
    {
        // Every transition is triggered by an id attribute; nothing else matters here.
        if (oChunk.sParams[i] != "id")
        {
            continue;
        }

        switch (oChunk.sValues[i])
        {
            case "main_right_left":
                if (state == 0)
                {
                    state = 1;
                }
                break;

            case "word":
                if (state > 0)
                {
                    state = 2;
                }
                break;

            case "pron":
                if (state == 3)
                {
                    state = 4;
                }
                break;

            case "exp_exp":
                if (state > 2)
                {
                    state = 6;
                    divCount = 1;   // section is closed by counting divs back down to zero
                }
                break;

            case "exp_eg":
                if (state > 2)
                {
                    state = 8;
                }
                break;

            case "exp_tran":
                if (state > 2)
                {
                    state = 11;
                }
                break;

            case "exp_eee":
                if (state > 2)
                {
                    state = 14;
                    divCount = 1;
                }
                break;

            case "exp_baike":
                if (state > 2)
                {
                    state = 16;
                    divCount = 1;
                }
                break;
        }
    }
}
/// <summary>
/// Splits an HTML string into chunks and returns independent copies of every chunk
/// whose raw HTML is not blank.
/// </summary>
/// <param name="str">HTML text to parse.</param>
/// <returns>
/// One <c>HTMLchunk</c> per non-blank chunk, in document order, with only
/// <c>oHTML</c>, <c>oType</c> and <c>sTag</c> filled in.
/// </returns>
private HTMLchunk[] htmlParse(string str)
{
    // Fully qualified so no extra using directive is needed for the generic list.
    System.Collections.Generic.List<HTMLchunk> ret = new System.Collections.Generic.List<HTMLchunk>();

    Majestic12.HTMLparser parser = new Majestic12.HTMLparser();

    // keep raw html because we need to reconstruct it
    parser.bKeepRawHTML = true;
    parser.bTextMode = true;

    try
    {
        parser.Init(str);

        // The parser returns null once the end of the input is reached.
        Majestic12.HTMLchunk chunk;
        while ((chunk = parser.ParseNext()) != null)
        {
            // discard empty blocks for performance increase
            if (chunk.oHTML == null || chunk.oHTML.Trim().Length == 0)
            {
                continue;
            }

            // The parser re-uses its chunk object, so the values are copied into a
            // fresh chunk. Strings are immutable, so sharing the references is a
            // safe copy; String.Copy is obsolete and throws on null.
            HTMLchunk clone = new HTMLchunk(false);
            clone.oHTML = chunk.oHTML;
            clone.oType = chunk.oType;
            clone.sTag = chunk.sTag;
            ret.Add(clone);
        }
    }
    finally
    {
        // always reset the parser, even when parsing throws
        parser.CleanUp();
    }

    return ret.ToArray();
}
/// <summary>
/// Resets the parser so that it is ready for the next parsing run.
/// </summary>
public void CleanUp()
{
    // the entities table is created on first use and then kept
    if (Entities == null)
    {
        InitEntities();
    }

    // forget the previous input and start with a fresh chunk at position zero
    HtmlBytes = null;
    CurrentChunk = new HTMLchunk(true);
    CurPos = 0;
    DataLength = 0;
}
/// <summary>
/// Advances the scraper state machine when an opening tag is seen.
/// </summary>
/// <param name="oChunk">Chunk holding the opening tag.</param>
/// <param name="state">Current state; updated in place.</param>
private void HandleOpenTag(HTMLchunk oChunk, ref int state)
{
    string tag = oChunk.sTag;

    // Each state waits for one specific tag before moving on.
    switch (state)
    {
        case 0:
            if (tag == "div")
            {
                state = 1;
            }
            break;

        case 2:
            if (tag == "ul")
            {
                state = 3;
            }
            break;

        case 4:
            if (tag == "li")
            {
                state = 5;
            }
            break;

        case 5:
            if (tag == "a")
            {
                state = 6;
            }
            break;
    }
}
/// <summary>
/// Builds the HTML-encoded source of one side of a diff with the edited chunk ranges
/// wrapped in marker tags.
/// </summary>
/// <param name="chunks">Chunks of the version being rendered.</param>
/// <param name="edits">Edit script describing the differences between the two versions.</param>
/// <param name="isOlderVersion">
/// True to render the older version (changes and deletes are marked, positions come from
/// <c>StartA</c>); false to render the newer version (changes and inserts are marked,
/// positions come from <c>StartB</c>).
/// </param>
/// <returns>The encoded chunks joined into one string, with marker tags inserted.</returns>
private static string getMarkedUpSource(HTMLchunk[] chunks, Menees.DiffUtils.EditScript edits, bool isOlderVersion)
{
    string[] str = new string[chunks.Length];

    // html encode the source so it won't render
    for (int i = 0; i < str.Length; i++)
    {
        str[i] = System.Web.HttpUtility.HtmlEncode(chunks[i].oHTML);
    }

    foreach (Menees.DiffUtils.Edit curr in edits)
    {
        int start = isOlderVersion ? curr.StartA : curr.StartB;

        // Pick the opening marker; null means this edit is not shown on this side.
        string openTag = null;
        switch (curr.Type)
        {
            case Menees.DiffUtils.EditType.Change:
                // changes are marked as deletes in older version and adds in newer version
                openTag = isOlderVersion ? Tags.changeDelete : Tags.changeAdd;
                break;

            case Menees.DiffUtils.EditType.Delete:
                // deletes are marked in the older version
                if (isOlderVersion)
                {
                    openTag = Tags.delete;
                }
                break;

            case Menees.DiffUtils.EditType.Insert:
                // inserts are marked in the newer version
                if (!isOlderVersion)
                {
                    openTag = Tags.add;
                }
                break;
        }

        if (openTag == null || curr.Length <= 0 || start < 0 || start >= str.Length)
        {
            continue;
        }

        // The edit covers chunks start .. start + Length - 1. The closing marker belongs on
        // the LAST covered chunk; using start + Length would also highlight the following
        // chunk and run off the end of the array when the edit reaches the last chunk.
        int last = start + curr.Length - 1;
        if (last >= str.Length)
        {
            last = str.Length - 1;
        }

        str[start] = openTag + str[start];
        str[last] += Tags.close;
    }

    return String.Join("", str);
}
/// <summary>
/// Advances the scraper state machine when an opening tag is seen.
/// </summary>
/// <param name="oChunk">Chunk holding the opening tag.</param>
/// <param name="state">Current state; updated in place.</param>
private void HandleOpenTag(HTMLchunk oChunk, ref int state)
{
    switch (oChunk.sTag)
    {
        case "tr":
            // a new row always restarts the row handling
            state = 1;
            break;

        case "td":
            // cells only count once a row has been entered
            if (state > 0)
            {
                state = 2;
            }
            break;

        case "a":
            switch (state)
            {
                case 3:
                case 5:
                case 7:
                    state += 1;
                    break;

                case 9:
                    // another link after the last one: handle it like the previous link
                    state = 8;
                    break;
            }
            break;
    }
}
/// <summary>
/// Inspects the attributes of a tag and advances the scraper state machine, filling in
/// the URLs of the result item that is currently being collected.
/// </summary>
/// <param name="oChunk">Chunk whose attributes (sParams / sValues) are examined.</param>
/// <param name="state">Current state; updated in place.</param>
private void HandleParam(HTMLchunk oChunk, ref int state)
{
    // The loop bound already covers the "no attributes" case.
    for (int i = 0; i < oChunk.iParams; i++)
    {
        string name = oChunk.sParams[i];
        string value = oChunk.sValues[i];

        if (value == "f" && name == "class" && state == 2)
        {
            // A new result block starts: flush the previous item if it has a URL.
            state = 3;
            if (!string.IsNullOrEmpty(item.Url))
            {
                searchResult.Results.Add(item);
                item = new SearchEngineResult.ResultItem();
            }
        }
        else if (name == "href")
        {
            if (state == 4)
            {
                item.Url = value;
            }
            else if ((state == 6 || state == 8) && value != null)
            {
                // URL fragments are fixed ASCII tokens, so they are matched ordinally
                // rather than with the current culture.
                if (value.IndexOf("cache", System.StringComparison.Ordinal) != -1)
                {
                    item.CacheUrl = value;
                }
                else if (value.StartsWith("s?cl=2", System.StringComparison.Ordinal))
                {
                    item.SimilarUrl = value;
                }
            }
        }
    }
}
static Regex _shortHtmlComment = new Regex(@"^<!-.*->$"); // matches "<!-Extra_Images->"

/// <summary>
/// Converts a parsed tag chunk into an <see cref="XElement"/> carrying the tag's attributes.
/// </summary>
/// <param name="m12chunk">Chunk produced by the Majestic-12 parser for the tag.</param>
/// <param name="originalHtml">Raw HTML of the tag; used to recognise special constructs and to clean up the tag name.</param>
/// <param name="xmlnsIndex">Counter passed through to TryCleanupAttributeName.</param>
/// <returns>
/// The new element; a placeholder element for a doctype or for one of the recognised
/// constructs that are being removed; or null when the chunk has no tag name and
/// matches none of those (such nodes are discarded).
/// </returns>
/// <exception cref="Exception">The same attribute occurs more than once with different values.</exception>
static XElement ParseTagNode(Majestic12.HTMLchunk m12chunk, string originalHtml, ref int xmlnsIndex)
{
    if (string.IsNullOrEmpty(m12chunk.sTag))
    {
        // iParams is the number of attributes actually parsed for this chunk (it is the
        // bound of the attribute loop below); the length of sParams is not.
        if (m12chunk.iParams > 0
            && string.Equals(m12chunk.sParams[0], "doctype", StringComparison.OrdinalIgnoreCase))
        {
            return new XElement("doctype");
        }

        if (_weirdTag.IsMatch(originalHtml))
        {
            return new XElement("REMOVED_weirdBlockParenthesisTag");
        }

        if (_aspnetPrecompiled.IsMatch(originalHtml))
        {
            return new XElement("REMOVED_ASPNET_PrecompiledDirective");
        }

        if (_shortHtmlComment.IsMatch(originalHtml))
        {
            return new XElement("REMOVED_ShortHtmlComment");
        }

        // Nodes like "<br <br>" will end up with a m12chunk.sTag==""... We discard these nodes.
        return null;
    }

    string tagName = CleanupTagName(m12chunk.sTag, originalHtml);
    XElement result = new XElement(tagName);
    List<XAttribute> attributes = new List<XAttribute>();

    for (int i = 0; i < m12chunk.iParams; i++)
    {
        if (m12chunk.sParams[i] == "<!--")
        {
            // An HTML comment was embedded within a tag. Majestic-12 reports the comment
            // and its contents as attributes, so skip forward to the comment terminator.
            // The terminator is one of the *attributes*, so sParams[i] has to be tested:
            // testing sTag never changes inside the loop and can never find it.
            for (; i < m12chunk.iParams; i++)
            {
                if (m12chunk.sParams[i] == "--" || m12chunk.sParams[i] == "-->")
                {
                    break;
                }
            }

            // the loop increment steps over the terminator itself
            continue;
        }

        if (m12chunk.sParams[i] == "?" && string.IsNullOrEmpty(m12chunk.sValues[i]))
        {
            continue;
        }

        string attributeName = m12chunk.sParams[i];
        if (!TryCleanupAttributeName(attributeName, ref xmlnsIndex, out attributeName))
        {
            continue;
        }

        attributes.Add(new XAttribute(attributeName, m12chunk.sValues[i]));
    }

    // If attributes are duplicated with different values, we complain.
    // If attributes are duplicated with the same value, we remove all but 1.
    // The groups are materialised first because the list is modified inside the loop.
    var duplicatedAttributes = attributes
        .GroupBy(A => A.Name)
        .Where(G => G.Count() > 1)
        .ToList();

    foreach (var duplicatedAttribute in duplicatedAttributes)
    {
        if (duplicatedAttribute.GroupBy(DA => DA.Value).Count() > 1)
        {
            throw new Exception("Attribute value was given different values");
        }

        attributes.RemoveAll(A => A.Name == duplicatedAttribute.Key);
        attributes.Add(duplicatedAttribute.First());
    }

    result.Add(attributes);
    return result;
}
/// <summary>
/// Inspects the attributes of a tag and advances the scraper state machine, filling in
/// the URL and title of the result item that is currently being collected.
/// </summary>
/// <param name="oChunk">Chunk whose attributes (sParams / sValues) are examined.</param>
/// <param name="state">Current state; updated in place.</param>
private void HandleParam(HTMLchunk oChunk, ref int state)
{
    // The loop bound already covers the "no attributes" case.
    for (int i = 0; i < oChunk.iParams; i++)
    {
        string name = oChunk.sParams[i];
        string value = oChunk.sValues[i];

        if (value == "bodyContent" && name == "id" && state == 1)
        {
            state = 2;
        }
        else if (value == "mw-search-results" && name == "class" && state == 3)
        {
            state = 4;
        }
        else if (name == "href" && state == 6)
        {
            // the link value is appended to the site root to form the item URL
            item.Url = "http://en.wikipedia.org" + value;
        }
        else if (name == "title" && state == 6)
        {
            item.Title = value;
        }
    }
}
/// <summary>
/// Parses the HTML chunk by chunk and prints each chunk to the console. When
/// bReadLineDelay is set it waits for ENTER after every chunk, otherwise it just
/// ends the line.
/// </summary>
/// <param name="oP">Parser object</param>
void ParseAndPrint(HTMLparser oP)
{
    if (bReadLineDelay)
    {
        Console.WriteLine("Parsing HTML, will print each parsed chunk, press ENTER after each to continue");
    }

    // The parser hands back the same HTMLchunk instance on every call (it re-uses it),
    // so the chunk must not be destroyed before parsing has finished.
    // A null chunk marks the end of the input.
    for (HTMLchunk oChunk = oP.ParseNext(); oChunk != null; oChunk = oP.ParseNext())
    {
        switch (oChunk.oType)
        {
            // open tag, e.g. <a href="">
            case HTMLchunkType.OpenTag:
                Console.Write("Open tag: " + oChunk.sTag);

                if (oChunk.bHashMode)
                {
                    // In hash mode params/values live in the oChunk.oParams table. That is
                    // the slower option; turn hash mode off for the highest performance.
                    foreach (string sParam in oChunk.oParams.Keys)
                    {
                        string sValue = oChunk.oParams[sParam].ToString();

                        if (sValue.Length == 0)
                        {
                            Console.Write(" {0}", sParam);
                        }
                        else
                        {
                            Console.Write(" {0}='{1}'", sParam, sValue);
                        }
                    }
                }
                else
                {
                    // Array based access: less convenient but saves a lot of CPU ticks,
                    // which pays off when only a few params are needed.
                    for (int i = 0; i < oChunk.iParams; i++)
                    {
                        string sName = oChunk.sParams[i];
                        string sValue = oChunk.sValues[i];

                        if (sValue.Length == 0)
                        {
                            Console.Write(" {0}", sName);
                        }
                        else
                        {
                            Console.Write(" {0}='{1}'", sName, sValue);
                        }
                    }
                }
                break;

            // close tag, e.g. </a>
            case HTMLchunkType.CloseTag:
                Console.Write("Closed tag: " + oChunk.sTag);
                break;

            // plain text
            case HTMLchunkType.Text:
                Console.Write("Text: '{0}'", oChunk.oHTML);
                break;

            // HTML comment, i.e. everything between <!-- and -->
            case HTMLchunkType.Comment:
                // Comments are not finalised by default for performance reasons, so that
                // has to be requested here - unless the parser keeps raw HTML, in which
                // case finalising is not needed.
                if (!oP.bKeepRawHTML)
                {
                    oChunk.Finalise();
                }

                Console.Write("Comment: " + oChunk.oHTML);
                break;
        }

        if (bReadLineDelay)
        {
            Console.ReadLine();
        }
        else
        {
            Console.WriteLine("");
        }
    }
}
/// <summary> /// Internal: parses tag that started from current position /// </summary> /// <returns>HTMLchunk with tag information</returns> internal HTMLchunk ParseTag(ref int iCurPos) { /* * WARNING: this code was optimised for performance rather than for readability, * so be extremely careful at changing it -- your changes could easily result in wrongly parsed HTML * * This routine takes about 60% of CPU time, in theory its the best place to gain extra speed, * but I've spent plenty of time doing it, so it won't be easy... and if it is easy then please post * your changes for everyone to enjoy! * * * */ //bool bWhiteSpaceHere=false; //bool bParamValue=false; byte cChar=0; byte cPeek=0; // if true it means we have parsed complete tag //bool bGotTag=false; //int iEqualIdx=0; // we reach this function immediately after tag's byte (<) was // detected, so we need to save it in order to keep correct HTML copy // oChunk.Append((byte)'<'); // (byte)'<' /* oChunk.bBuffer[0]=60; oChunk.iBufPos=1; oChunk.iHTMLen=1; */ // initialise peeked char - this will point to the next after < character if(iCurPos<iDataLength) { cPeek=bHTML[iCurPos]; // in case of comments ! must follow immediately after < if(cPeek==(byte)'!') { if(iCurPos+2<iDataLength && bHTML[iCurPos+1]==(byte)'-' && bHTML[iCurPos+2]==(byte)'-') { // we detected start of comments here, instead of parsing the rest here we will // call special function tuned to do the job much more effectively oChunk.sTag="!--"; oChunk.oType=HTMLchunkType.Comment; oChunk.bComments=true; // oChunk.Append((byte)'!'); // oChunk.Append((byte)'-'); // oChunk.Append((byte)'-'); iCurPos+=3; bool bFullTag; oChunk=ParseComments(ref iCurPos,out bFullTag); oChunk.iChunkLength=iCurPos-oChunk.iChunkOffset; if(oP.bAutoKeepComments || oP.bKeepRawHTML) { if(!oP.bAutoExtractBetweenTagsOnly) oChunk.oHTML=GetString(oChunk.iChunkOffset,oChunk.iChunkLength); else { oChunk.oHTML=GetString(oChunk.iChunkOffset+4,oChunk.iChunkLength-(bFullTag ? 
7 : 4)); } } return oChunk; } // ok we might have here CDATA element of XML: // ref: http://www.w3schools.com/xml/xml_cdata.asp if(iCurPos+7<iDataLength && bHTML[iCurPos+1]==(byte)'[' && bHTML[iCurPos+2]==(byte)'C' && bHTML[iCurPos+3]==(byte)'D' && bHTML[iCurPos+4]==(byte)'A' && bHTML[iCurPos+5]==(byte)'T' && bHTML[iCurPos+6]==(byte)'A' && bHTML[iCurPos+7]==(byte)'[' ) { // we detected start of comments here, instead of parsing the rest here we will // call special function tuned to do the job much more effectively oChunk.sTag="![CDATA["; oChunk.oType=HTMLchunkType.Comment; oChunk.bComments=true; // oChunk.Append((byte)'!'); // oChunk.Append((byte)'-'); // oChunk.Append((byte)'-'); iCurPos+=8; bool bFullTag; oChunk=ParseCDATA(ref iCurPos,out bFullTag); oChunk.iChunkLength=iCurPos-oChunk.iChunkOffset; if(oP.bAutoKeepComments || oP.bKeepRawHTML) { if(!oP.bAutoExtractBetweenTagsOnly) oChunk.oHTML=GetString(oChunk.iChunkOffset,oChunk.iChunkLength); else { oChunk.oHTML=GetString(oChunk.iChunkOffset+4+5, oChunk.iChunkLength-(bFullTag ? 7+5 : 4+5)); } } return oChunk; } } } else { // empty tag but its not closed, so we will call it open... oChunk.oType=HTMLchunkType.OpenTag; // end of data... 
before it started return oChunk; } // tag ID, non-zero if matched by heuristics engine int iTagID=0; // STAGE 0: lets try some heuristics to see if we can quickly identify most common tags // that should be present most of the time, this should save a lot of looping and string creation if(bEnableHeuristics && iCurPos<iMaxHeuDataLength) { // check if we have got closure of the tag if(cPeek==(byte)'/') { oChunk.bClosure=true; oChunk.bEndClosure=false; oChunk.oType=HTMLchunkType.CloseTag; iCurPos++; cPeek=bHTML[iCurPos]; } cChar=bHTML[iCurPos+1]; // probability of having a match is very high (or so we expect) iTagID=oHE.MatchTag(cPeek,cChar); if(iTagID!=0) { if(iTagID<0) { iTagID*=-1; // single character tag oChunk.sTag=oHE.GetString(iTagID); // see if we got fully closed tag if(cChar==(byte)'>') { iCurPos+=2; goto ReturnChunk; } cPeek=cChar; iCurPos++; // everything else means we need to continue scanning as we may have params and stuff goto AttributeParsing; } else { // ok, we have here 2 or more character string that we need to check further // often when we have full 2 char match the next char will be >, if that's the case // then we definately matched our tag byte cNextChar=bHTML[iCurPos+2]; if(cNextChar==(byte)'>') { //oChunk.sTag=oHE.GetString(iTagID); oChunk.sTag=oHE.GetTwoCharString(cPeek,cChar); iCurPos+=3; goto ReturnChunk; } // ok, check next char for space, if that's the case we still got our tag // but need to skip to attribute parsing if(cNextChar==(byte)' ') { //oChunk.sTag=oHE.GetString(iTagID); oChunk.sTag=oHE.GetTwoCharString(cPeek,cChar); iCurPos+=2; cPeek=cNextChar; goto AttributeParsing; } // ok, we are not very lucky, but it is still worth fighting for // now we need to check fully long string against what we have matched, maybe // we got exact match and we can avoid full parsing of the tag byte[] bTag=oHE.GetStringData(iTagID); if(iCurPos+bTag.Length+5>=iDataLength) goto TagParsing; // in a loop (and this is not an ideal solution, but still) 
for(int i=2; i<bTag.Length; i++) { // if a single char is not matched, then we if(bTag[i]!=bHTML[iCurPos+i]) { goto TagParsing; } } // ok we matched full long word, but we need to be sure that char // after the word is ' ' or '>' as otherwise we may have matched prefix of even longer // word cNextChar=bHTML[iCurPos+bTag.Length]; if(cNextChar==(byte)'>') { oChunk.sTag=oHE.GetString(iTagID); iCurPos+=bTag.Length+1; goto ReturnChunk; } if(cNextChar==(byte)' ') { cPeek=cNextChar; oChunk.sTag=oHE.GetString(iTagID); iCurPos+=bTag.Length; goto AttributeParsing; } // no luck: we need to parse tag fully as our heuristical matching failed miserably :'o( } } } TagParsing: sText.Clear(); byte bCharType=0; // STAGE 1: parse tag (anything until > or /> or whitespace leading to start of attribute) while(cPeek!=0) { bCharType=bTagCharTypes[cPeek]; //if(cPeek<=32 && bWhiteSpace[cPeek]==1) if(bCharType==(byte)TagCharType.WhiteSpace) { iCurPos++; // speculative loop unroll -- we have a very good chance of seeing non-space char next // so instead of setting up loop we will just read it directly, this should save ticks // on having to prepare while() loop if(iCurPos<iDataLength) cChar=bHTML[iCurPos++]; else cChar=0; bCharType=bTagCharTypes[cChar]; //if(cChar==' ' || cChar=='\t' || cChar==13 || cChar==10) //if(cChar<=32 && bWhiteSpace[cChar]==1) if(bCharType==(byte)TagCharType.WhiteSpace) { while(iCurPos<iDataLength) { cChar=bHTML[iCurPos++]; bCharType=bTagCharTypes[cChar]; if(bCharType==(byte)TagCharType.WhiteSpace) //if(cChar!=' ' && cChar!='\t' && cChar!=13 && cChar!=10) { //cPeek=bHTML[iCurPos]; continue; } break; } if(iCurPos>=iDataLength) cChar=0; } //bWhiteSpaceHere=true; // now, if we have already got tag it means that we are most likely // going to need to parse tag attributes if(sText.iBufPos>0) { oChunk.sTag=sText.SetToStringASCII(); // oChunk.Append((byte)' '); iCurPos--; if(iCurPos<iDataLength) cPeek=bHTML[iCurPos]; else cPeek=0; break; } } else { // reuse Peeked char from 
previous run //cChar=cPeek; iCurPos++; if(iCurPos<iDataLength) cChar=bHTML[iCurPos++]; else cChar=0; } if(iCurPos<iDataLength) cPeek=bHTML[iCurPos]; else cPeek=0; // most likely we should have lower-cased ASCII char if(bCharType==(byte)TagCharType.LowerCasedASCIIorDigit) { sText.bBuffer[sText.iBufPos++]=cChar; // oChunk.Append(cChar); continue; } // tag end - we did not have any params if(cChar==(byte)'>') { if(sText.iBufPos>0) oChunk.sTag=sText.SetToStringASCII(); if(!oChunk.bClosure) oChunk.oType=HTMLchunkType.OpenTag; return oChunk; } // closure of tag sign if(cChar==(byte)'/') { oChunk.bClosure=true; oChunk.bEndClosure=(sText.iBufPos>0); oChunk.oType=HTMLchunkType.CloseTag; continue; } // 03/08/08 XML support: ?xml tags - grrr if(cChar==(byte)'?') { sText.bBuffer[sText.iBufPos++]=cChar; continue; } // nope, we have got upper cased ASCII char - this seems to be LESS likely than > and / //if(cChar>=65 && cChar<=90) if(bCharType>32) { // bCharType in this case contains already lower-cased char sText.bBuffer[sText.iBufPos++]=bCharType; // oChunk.Append(bCharType); continue; } // we might have namespace : sign here - all text before would have to be // saved as namespace and we will need to continue parsing actual tag if(bCharType==(byte)TagCharType.NameSpaceColon) { // ok here we got a choice - we can just continue and treat the whole // thing as a single tag with namespace stuff prefixed, OR // we can separate first part into namespace and keep tag as normal sText.bBuffer[sText.iBufPos++]=(byte)':'; continue; } // ok, we have got some other char - we break out to deal with it in attributes part break; } if(cPeek==0) { return oChunk; } // if true then equal sign was found //bool bEqualsSign=false; // STAGE 2: parse attributes (if any available) // attribute name can be standalone or with value after = // attribute itself can't have entities or anything like this - we expect it to be in ASCII characters AttributeParsing: string sAttrName; if(iTagID!=0) { // first, 
skip whitespace: if(cPeek<=32 && bTagCharTypes[cPeek]==(byte)TagCharType.WhiteSpace) { // most likely next char is not-whitespace iCurPos++; if(iCurPos>=iDataLength) goto ReturnChunk; cPeek=bHTML[iCurPos]; if(cPeek<=32 && bTagCharTypes[cPeek]==(byte)TagCharType.WhiteSpace) { // ok long loop here then while(iCurPos<iDataLength) { cPeek=bHTML[iCurPos++]; if(cPeek<=32 && bTagCharTypes[cPeek]==(byte)TagCharType.WhiteSpace) continue; break; } if(cPeek==(byte)'>') goto ReturnChunk; iCurPos--; if(iCurPos>=iDataLength) goto ReturnChunk; } if(iCurPos>=iDataLength) goto ReturnChunk; } // ok we have got matched tag, it is possible that we might be able to quickly match // attribute name known to be used for that tag: int iAttrID=oHE.MatchAttr(cPeek,iTagID); if(iAttrID>0) { byte[] bAttr=oHE.GetAttrData(iAttrID); if(iCurPos+bAttr.Length+2>=iDataLength) goto ActualAttributeParsing; // in a loop (and this is not an ideal solution, but still) for(int i=1; i<bAttr.Length; i++) { // if a single char is not matched, then we if(bAttr[i]!=bHTML[iCurPos+i]) { goto ActualAttributeParsing; } } byte cNextChar=bHTML[iCurPos+bAttr.Length]; // ok, we expect next symbol to be = if(cNextChar==(byte)'=') { sAttrName=oHE.GetAttr(iAttrID); iCurPos+=bAttr.Length+1; cPeek=bHTML[iCurPos]; goto AttributeValueParsing; } } } ActualAttributeParsing: sText.Clear(); // doing exactly the same thing as in tag parsing while(cPeek!=0) { bCharType=bTagCharTypes[cPeek]; //if(cPeek<=32 && bWhiteSpace[cPeek]==1) if(bCharType==(byte)TagCharType.WhiteSpace) { iCurPos++; // speculative loop unroll -- we have a very good chance of seeing non-space char next // so instead of setting up loop we will just read it directly, this should save ticks // on having to prepare while() loop if(iCurPos<iDataLength) cChar=bHTML[iCurPos++]; else { cPeek=0; break; } bCharType=bTagCharTypes[cChar]; //if(cChar==' ' || cChar=='\t' || cChar==13 || cChar==10) //if(cChar<=32 && bWhiteSpace[cChar]==1) 
if(bCharType==(byte)TagCharType.WhiteSpace) { while(iCurPos<iDataLength) { cChar=bHTML[iCurPos++]; bCharType=bTagCharTypes[cChar]; if(bCharType==(byte)TagCharType.WhiteSpace) //if(cChar!=' ' && cChar!='\t' && cChar!=13 && cChar!=10) { //cPeek=bHTML[iCurPos]; continue; } //if(cChar==(byte)'>') // goto ReturnChunk; //iCurPos--; break; } if(iCurPos>=iDataLength) { cChar=0; cPeek=0; break; } } //bWhiteSpaceHere=true; // now, if we have already got attribute name it means that we need to go to parse value (which may not be present) if(sText.iBufPos>0) { // oChunk.Append((byte)' '); iCurPos--; if(iCurPos<iDataLength) cPeek=bHTML[iCurPos]; else cPeek=0; // ok, we have got attribute name and now we have got next char there // most likely we have got = here and then value if(cPeek==(byte)'=') { //bEqualsSign=true; // move forward one char iCurPos++; if(iCurPos<iDataLength) cPeek=bHTML[iCurPos]; else cPeek=0; break; } // or we can have end of tag itself, doh! if(cPeek==(byte)'>') { // move forward one char iCurPos++; if(sText.iBufPos>0) oChunk.AddParam(sText.SetToStringASCII(),"",(byte)' '); if(!oChunk.bClosure) oChunk.oType=HTMLchunkType.OpenTag; return oChunk; } // closure if(cPeek==(byte)'/') { oChunk.bClosure=true; oChunk.bEndClosure=true; oChunk.oType=HTMLchunkType.CloseTag; continue; } // ok, we have got new char starting after current attribute name is fully parsed // this means the attribute name is on its own and the char we found is start // of a new attribute oChunk.AddParam(sText.SetToStringASCII(),"",(byte)' '); sText.Clear(); goto AttributeParsing; } } else { // reuse Peeked char from previous run //cChar=cPeek; iCurPos++; if(iCurPos<iDataLength) cChar=bHTML[iCurPos++]; else cChar=0; } if(iCurPos<iDataLength) cPeek=bHTML[iCurPos]; else cPeek=0; // most likely we should have lower-cased ASCII char here if(bCharType==(byte)TagCharType.LowerCasedASCIIorDigit) { sText.bBuffer[sText.iBufPos++]=cChar; // oChunk.Append(cChar); continue; } // = with attribute value to 
follow if(cChar==(byte)'=') { //bEqualsSign=true; break; } // nope, we have got upper cased ASCII char - this seems to be LESS likely than > and / //if(cChar>=65 && cChar<=90) if(bCharType>32) { // bCharType in this case contains already lower-cased char sText.bBuffer[sText.iBufPos++]=bCharType; // oChunk.Append(bCharType); continue; } // tag end - we did not have any params if(cChar==(byte)'>') { if(sText.iBufPos>0) oChunk.AddParam(sText.SetToStringASCII(),"",(byte)' '); if(!oChunk.bClosure) oChunk.oType=HTMLchunkType.OpenTag; return oChunk; } // closure of tag sign if(cChar==(byte)'/') { oChunk.bClosure=true; oChunk.bEndClosure=true; oChunk.oType=HTMLchunkType.CloseTag; continue; } // some other char sText.bBuffer[sText.iBufPos++]=cChar; // oChunk.Append(cChar); } if(cPeek==0) { if(sText.iBufPos>0) oChunk.AddParam(sText.SetToStringASCII(),"",(byte)' '); if(!oChunk.bClosure) oChunk.oType=HTMLchunkType.OpenTag; return oChunk; } sAttrName=sText.SetToStringASCII(); AttributeValueParsing: /// *********************************************************************** /// STAGE 3: parse attribute value /// *********************************************************************** // the value could be just string, or in quotes (single or double) // or we can have next attribute name start, in which case we will jump back to attribute parsing // for tracking quotes purposes byte cQuotes=cPeek; int iValueStartOffset; // skip whitespace if any if(cPeek<=32 && bTagCharTypes[cPeek]==(byte)TagCharType.WhiteSpace) { iCurPos++; // speculative loop unroll -- we have a very good chance of seeing non-space char next // so instead of setting up loop we will just read it directly, this should save ticks // on having to prepare while() loop if(iCurPos<iDataLength) cPeek=bHTML[iCurPos]; else { iValueStartOffset=iCurPos-1; goto AttributeValueEnd; } //if(cChar==' ' || cChar=='\t' || cChar==13 || cChar==10) //if(cChar<=32 && bWhiteSpace[cChar]==1) if(cPeek<=32 && 
bTagCharTypes[cPeek]==(byte)TagCharType.WhiteSpace) { while(iCurPos<iDataLength) { cPeek=bHTML[iCurPos++]; if(cPeek<=32 && bTagCharTypes[cPeek]==(byte)TagCharType.WhiteSpace) //if(cChar!=' ' && cChar!='\t' && cChar!=13 && cChar!=10) { //cPeek=bHTML[iCurPos]; continue; } iCurPos--; break; } if(iCurPos>=iDataLength) { iValueStartOffset=iCurPos-1; goto AttributeValueEnd; } } cQuotes=cPeek; } // because we deal with VALUE of the attribute it means we can't lower-case it, // or skip whitespace (if in quotes), which in practice means that we don't need to copy // it to temporary string buffer, we can just remember starting offset and then create string from // data in bHTML // ok, first char can be one of the quote chars or something else if(cPeek!='\"' && cPeek!='\'') { iValueStartOffset=iCurPos; cQuotes=(byte)' '; // any other char here means we have value up until next whitespace or end of tag // this gives us good opportunity to scan fairly quickly without otherwise redundant // checks - this should happen fairly rarely, however loop dealing with data between quotes // will happen often enough and its best to eliminate as much stuff from it as possible //sText.bBuffer[sText.iBufPos++]=cPeek; // move to next char if(iCurPos<iDataLength) cPeek=bHTML[iCurPos++]; else { goto AttributeValueEnd; } while(cPeek!=0) { // if whitespace then we got our value and need to go back to param if(cPeek<=32 && bTagCharTypes[cPeek]==(byte)TagCharType.WhiteSpace) { oChunk.AddParam(sAttrName,GetString(iValueStartOffset,iCurPos-iValueStartOffset-1),(byte)' '); iCurPos--; goto AttributeParsing; } // end of tag? 
if(cPeek==(byte)'>') { //iCurPos--; break; } if(iCurPos<iDataLength) cPeek=bHTML[iCurPos++]; else { iCurPos=iDataLength+1; goto AttributeValueEnd; } } // ok we are done, add outstanding attribute oChunk.AddParam(sAttrName,GetString(iValueStartOffset,iCurPos-iValueStartOffset-1),(byte)' '); goto ReturnChunk; } // move one step forward iCurPos++; iValueStartOffset=iCurPos; if(iCurPos<iDataLength) cPeek=bHTML[iCurPos++]; else { goto AttributeValueEnd; } // attribute value parsing from between two quotes while(cPeek!=0) { // check whether we have got possible entity (can be anything starting with &) if(cPeek==38) { int iPrevPos=iCurPos; char cEntityChar=oE.CheckForEntity(bHTML,ref iCurPos,iDataLength); // restore current symbol if(cEntityChar==0) { if(iCurPos<iDataLength) cPeek=bHTML[iCurPos++]; else break; //sText.bBuffer[sText.iBufPos++]=38; //(byte)'&';; continue; } else { // okay we have got an entity, our hope of not having to copy stuff into variable // is over, we have to continue in a slower fashion :( // but thankfully this should happen very rarely, so, annoying to code, but // most codepaths will run very fast! int iPreEntLen=iPrevPos-iValueStartOffset-1; // 14/05/08 need to clear text - it contains attribute name text sText.Clear(); // copy previous data if(iPreEntLen>0) { Array.Copy(bHTML,iValueStartOffset,sText.bBuffer,0,iPreEntLen); sText.iBufPos=iPreEntLen; } // we have to skip now to next byte, since // some converted chars might well be control chars like > oChunk.bEntities=true; if(cChar==(byte)'<') oChunk.bLtEntity=true; // unless is space we will ignore it // note that this won't work if is defined as it should // byte int value of 160, rather than 32. 
//if(cChar!=' ') sText.Append(cEntityChar); if(iCurPos<iDataLength) cPeek=bHTML[iCurPos++]; else { goto AttributeValueEnd; } // okay, we continue here using in effect new inside loop as we might have more entities here // attribute value parsing from between two quotes while(cPeek!=0) { // check whether we have got possible entity (can be anything starting with &) if(cPeek==38) { char cNewEntityChar=oE.CheckForEntity(bHTML,ref iCurPos,iDataLength); // restore current symbol if(cNewEntityChar!=0) { if(cNewEntityChar==(byte)'<') oChunk.bLtEntity=true; sText.Append(cNewEntityChar); if(iCurPos<iDataLength) cPeek=bHTML[iCurPos++]; else goto AttributeValueEnd; continue; } } // check if is end of quotes if(cPeek==cQuotes) { // ok we finished scanning it: add param with value and then go back to param name parsing oChunk.AddParam(sAttrName,sText.SetToString(),cQuotes); if(iCurPos<iDataLength) cPeek=bHTML[iCurPos]; else break; goto AttributeParsing; } sText.bBuffer[sText.iBufPos++]=cPeek; //sText.Append(cPeek); if(iCurPos<iDataLength) cPeek=bHTML[iCurPos++]; else break; } oChunk.AddParam(sAttrName,sText.SetToString(),cQuotes); goto ReturnChunk; } } // check if is end of quotes if(cPeek==cQuotes) { // ok we finished scanning it: add param with value and then go back to param name parsing //sText.Clear(); oChunk.AddParam(sAttrName,GetString(iValueStartOffset,iCurPos-iValueStartOffset-1),cQuotes); if(iCurPos<iDataLength) cPeek=bHTML[iCurPos]; else { //iCurPos++; break; } goto AttributeParsing; } if(iCurPos<iDataLength) cPeek=bHTML[iCurPos++]; else { //iCurPos++; break; } } AttributeValueEnd: // ok we are done, add outstanding attribute int iLen=iCurPos-iValueStartOffset-1; if(iLen>0) oChunk.AddParam(sAttrName,GetString(iValueStartOffset,iLen),cQuotes); else oChunk.AddParam(sAttrName,"",cQuotes); ReturnChunk: if(oChunk.bClosure) { oChunk.oType=HTMLchunkType.CloseTag; } else oChunk.oType=HTMLchunkType.OpenTag; return oChunk; }
/// <summary>
/// Fills in the oHTML field of a chunk with the raw HTML text the chunk was parsed from.
/// </summary>
/// <param name="oChunk">Chunk returned by ParseNext; it must belong to this same HTMLparser,
/// initialised with the same HTML data, because the chunk's offset and length are applied
/// to this parser's byte buffer</param>
public void SetRawHTML(HTMLchunk oChunk)
{
    // note: a byte array would arguably be a more faithful copy of the original data
    // than a decoded string
    int iRawOffset = oChunk.iChunkOffset;
    int iRawLength = oChunk.iChunkLength;

    oChunk.oHTML = oEnc.GetString(bHTML, iRawOffset, iRawLength);
}
/// <summary>
/// Rebuilds the HTML of one document version with its changed text chunks wrapped in marker tags.
/// </summary>
/// <param name="chunks">Parsed chunks of the version being rendered; their oHTML is concatenated</param>
/// <param name="edits">Edit script describing the differences between the two versions</param>
/// <param name="isOlderVersion">True when <paramref name="chunks"/> is the older version (edit
/// positions are read from StartA), false for the newer one (StartB)</param>
/// <returns>All chunk HTML joined together, with affected text chunks wrapped</returns>
private static string getMarkedUpHtml(HTMLchunk[] chunks, Menees.DiffUtils.EditScript edits, bool isOlderVersion)
{
    // start from the untouched raw HTML of every chunk
    string[] markup = new string[chunks.Length];
    for (int idx = 0; idx < markup.Length; idx++)
    {
        markup[idx] = chunks[idx].oHTML;
    }

    // for now only text nodes are marked up -- this needs improvement
    System.Collections.IEnumerator editWalker = edits.GetEnumerator();
    while (editWalker.MoveNext())
    {
        Menees.DiffUtils.Edit edit = (Menees.DiffUtils.Edit)editWalker.Current;
        int first = isOlderVersion ? edit.StartA : edit.StartB;

        // decide whether this edit is visible in the requested version and, if so,
        // which opening marker wraps its text chunks
        bool shouldMark = false;
        string openTag = null;
        switch (edit.Type)
        {
            case Menees.DiffUtils.EditType.Change:
                openTag = isOlderVersion ? Tags.changeDelete : Tags.changeAdd;
                shouldMark = true;
                break;

            case Menees.DiffUtils.EditType.Delete:
                // deletes are marked in the older version only
                if (isOlderVersion)
                {
                    openTag = Tags.delete;
                    shouldMark = true;
                }
                break;

            case Menees.DiffUtils.EditType.Insert:
                // inserts are marked in the newer version only
                if (!isOlderVersion)
                {
                    openTag = Tags.add;
                    shouldMark = true;
                }
                break;
        }

        if (!shouldMark)
        {
            // chunks are deliberately not touched for edits that do not apply to this version
            continue;
        }

        for (int offset = 0; offset < edit.Length; offset++)
        {
            int pos = first + offset;
            if (chunks[pos].oType == HTMLchunkType.Text)
            {
                markup[pos] = openTag + markup[pos] + Tags.close;
            }
        }
    }

    return String.Join("", markup);
}
/// <summary>
/// Appends the raw HTML of a text chunk to the field of the current item selected by the state.
/// </summary>
/// <param name="oChunk">Text chunk whose oHTML is appended</param>
/// <param name="state">Current state of the caller's state machine; only read here.
/// State 5 targets item.Title and state 7 targets item.Description; all other states are ignored</param>
private void HandleText(HTMLchunk oChunk, ref int state)
{
    switch (state)
    {
        case 5:
            item.Title += oChunk.oHTML;
            break;

        case 7:
            item.Description += oChunk.oHTML;
            break;
    }
}
/// <summary>
/// Computes one hash code per chunk from the chunk's raw HTML.
/// </summary>
/// <param name="chunks">Chunks to hash; each chunk's oHTML is the hashed text</param>
/// <returns>Array of hash codes, index-aligned with <paramref name="chunks"/></returns>
private int[] hash(HTMLchunk[] chunks)
{
    int chunkCount = chunks.Length;
    int[] digests = new int[chunkCount];

    // a single hasher instance is shared by all chunks
    Menees.DiffUtils.StringHasher chunkHasher =
        new Menees.DiffUtils.StringHasher(Menees.DiffUtils.HashType.CRC32, true, true, 0);

    int position = 0;
    while (position < chunkCount)
    {
        digests[position] = chunkHasher.GetHashCode(chunks[position].oHTML);
        position++;
    }

    return digests;
}
/// <summary>
/// Handles META tags that set page encoding
/// </summary>
/// <param name="oP">HTML parser object that is used for parsing</param>
/// <param name="oChunk">Parsed chunk that should contain tag META</param>
/// <param name="bEncodingSet">Your own flag that shows whether encoding was already set or not, if set
/// once then it should not be changed - this is the logic applied by major browsers. It is switched
/// to true here when the parser accepts the encoding found in the tag</param>
/// <returns>True if this was a META tag with http-equiv of content-type or content-category
/// (whether or not the encoding was actually changed), false otherwise</returns>
public static bool HandleMetaEncoding(HTMLparser oP, HTMLchunk oChunk, ref bool bEncodingSet)
{
    // cheap length and first-char checks reject nearly all tags before the full string comparison
    if (oChunk.sTag.Length != 4 || oChunk.sTag[0] != 'm' || oChunk.sTag != "meta")
    {
        return false;
    }

    // if we do not use hashmode already then we call conversion explicitly
    // this is slow, but METAs are very rare so performance penalty is low
    if (!oChunk.bHashMode)
    {
        oChunk.ConvertParamsToHash();
    }

    string sKey = oChunk.oParams["http-equiv"] as string;
    if (sKey == null)
    {
        return false;
    }

    // http-equiv is a protocol token, so it is compared ordinally and case-insensitively:
    // no lower-cased copy is allocated and the result does not depend on the current culture.
    // "content-category" is a rare variant (appears to work in IE) reported to exist in some
    // pages by Martin Bächtold
    bool bContentMeta =
        string.Equals(sKey, "content-type", System.StringComparison.OrdinalIgnoreCase)
        || string.Equals(sKey, "content-category", System.StringComparison.OrdinalIgnoreCase);

    if (!bContentMeta)
    {
        return false;
    }

    // we might have charset here that may hint at necessity to decode page.
    // once encoding is set it should not be changed, but you can be sure
    // there are web pages out there that try to do that
    if (!bEncodingSet)
    {
        string sData = oChunk.oParams["content"] as string;

        // it is possible we have broken META tag without Content part
        if (sData != null)
        {
            if (oP.SetEncoding(sData))
            {
                // here you need to reencode any text that you found so far -
                // most likely it will be just TITLE, the rest can be ignored anyway
                bEncodingSet = true;
            }
            else
            {
                // failed to set encoding - most likely encoding string
                // was incorrect or your machine lacks codepages or something
                // else - might be good idea to put warning message here
            }
        }
    }

    return true;
}
/// <summary>
/// Collects the text that was inserted or changed in version B into one space-prefixed string.
/// </summary>
/// <param name="chunks">Chunks of version B; edits are located through StartB</param>
/// <param name="edits">Edit script describing the differences between the two versions</param>
/// <returns>Raw HTML of every affected text chunk, each preceded by a single space</returns>
private static string getKewords(HTMLchunk[] chunks, Menees.DiffUtils.EditScript edits)
{
    System.Text.StringBuilder keywords = new System.Text.StringBuilder();

    System.Collections.IEnumerator editWalker = edits.GetEnumerator();
    while (editWalker.MoveNext())
    {
        Menees.DiffUtils.Edit edit = (Menees.DiffUtils.Edit)editWalker.Current;

        // only edits that add new text to version B are of interest
        bool addsTextToB = edit.Type == EditType.Insert || edit.Type == EditType.Change;
        if (!addsTextToB)
        {
            continue;
        }

        for (int offset = 0; offset < edit.Length; offset++)
        {
            HTMLchunk candidate = chunks[edit.StartB + offset];

            // tags and comments are skipped, only text chunks contribute
            if (candidate.oType == HTMLchunkType.Text)
            {
                keywords.Append(" ").Append(candidate.oHTML);
            }
        }
    }

    return keywords.ToString();
}
/// <summary>
/// Internal: parses tag that started from current position
/// </summary>
/// <param name="iCurPos">Offset into bHTML of the byte that follows the opening '&lt;'; it is
/// advanced as the tag is consumed</param>
/// <returns>HTMLchunk with tag information: for comments and CDATA the chunk returned by
/// ParseComments / ParseCDATA, otherwise the parser's own oChunk with tag name, type
/// (OpenTag or CloseTag) and any attributes added through AddParam</returns>
internal HTMLchunk ParseTag(ref int iCurPos)
{
    /*
     *  WARNING: this code was optimised for performance rather than for readability,
     *  so be extremely careful at changing it -- your changes could easily result in wrongly parsed HTML
     *
     *  This routine takes about 60% of CPU time, in theory its the best place to gain extra speed,
     *  but I've spent plenty of time doing it, so it won't be easy... and if it is easy then please post
     *  your changes for everyone to enjoy!
     *
     *  Invariant used throughout: cPeek mirrors bHTML[iCurPos] (or 0 at end of data) whenever
     *  control reaches one of the labels below.
     */

    byte cChar = 0;
    byte cPeek = 0;

    // we reach this function immediately after tag's byte (<) was detected

    // initialise peeked char - this will point to the next after < character
    if (iCurPos < iDataLength)
    {
        cPeek = bHTML[iCurPos];

        // in case of comments ! must follow immediately after <
        if (cPeek == (byte)'!')
        {
            if (iCurPos + 2 < iDataLength &&
                bHTML[iCurPos + 1] == (byte)'-' && bHTML[iCurPos + 2] == (byte)'-')
            {
                // we detected start of comments here, instead of parsing the rest here we will
                // call special function tuned to do the job much more effectively
                oChunk.sTag = "!--";
                oChunk.oType = HTMLchunkType.Comment;
                oChunk.bComments = true;
                iCurPos += 3;

                bool bFullTag;
                oChunk = ParseComments(ref iCurPos, out bFullTag);

                oChunk.iChunkLength = iCurPos - oChunk.iChunkOffset;

                if (oP.bAutoKeepComments || oP.bKeepRawHTML)
                {
                    if (!oP.bAutoExtractBetweenTagsOnly)
                    {
                        oChunk.oHTML = GetString(oChunk.iChunkOffset, oChunk.iChunkLength);
                    }
                    else
                    {
                        // strip the 4-byte opener and, when the comment was closed, the 3-byte closer
                        oChunk.oHTML = GetString(oChunk.iChunkOffset + 4, oChunk.iChunkLength - (bFullTag ? 7 : 4));
                    }
                }

                return oChunk;
            }

            // ok we might have here CDATA element of XML:
            // ref: http://www.w3schools.com/xml/xml_cdata.asp
            if (iCurPos + 7 < iDataLength &&
                bHTML[iCurPos + 1] == (byte)'[' &&
                bHTML[iCurPos + 2] == (byte)'C' &&
                bHTML[iCurPos + 3] == (byte)'D' &&
                bHTML[iCurPos + 4] == (byte)'A' &&
                bHTML[iCurPos + 5] == (byte)'T' &&
                bHTML[iCurPos + 6] == (byte)'A' &&
                bHTML[iCurPos + 7] == (byte)'['
                )
            {
                // we detected start of CDATA here, instead of parsing the rest here we will
                // call special function tuned to do the job much more effectively
                // (the chunk is reported with the Comment type, same as real comments)
                oChunk.sTag = "![CDATA[";
                oChunk.oType = HTMLchunkType.Comment;
                oChunk.bComments = true;
                iCurPos += 8;

                bool bFullTag;
                oChunk = ParseCDATA(ref iCurPos, out bFullTag);

                oChunk.iChunkLength = iCurPos - oChunk.iChunkOffset;

                if (oP.bAutoKeepComments || oP.bKeepRawHTML)
                {
                    if (!oP.bAutoExtractBetweenTagsOnly)
                    {
                        oChunk.oHTML = GetString(oChunk.iChunkOffset, oChunk.iChunkLength);
                    }
                    else
                    {
                        oChunk.oHTML = GetString(oChunk.iChunkOffset + 4 + 5, oChunk.iChunkLength - (bFullTag ? 7 + 5 : 4 + 5));
                    }
                }

                return oChunk;
            }
        }
    }
    else
    {
        // empty tag but its not closed, so we will call it open...
        oChunk.oType = HTMLchunkType.OpenTag;

        // end of data... before it started
        return oChunk;
    }

    // tag ID, non-zero if matched by heuristics engine
    int iTagID = 0;

    // STAGE 0: lets try some heuristics to see if we can quickly identify most common tags
    // that should be present most of the time, this should save a lot of looping and string creation
    if (bEnableHeuristics && iCurPos < iMaxHeuDataLength)
    {
        // check if we have got closure of the tag
        if (cPeek == (byte)'/')
        {
            oChunk.bClosure = true;
            oChunk.bEndClosure = false;
            oChunk.oType = HTMLchunkType.CloseTag;
            iCurPos++;
            cPeek = bHTML[iCurPos];
        }

        cChar = bHTML[iCurPos + 1];

        // probability of having a match is very high (or so we expect)
        iTagID = oHE.MatchTag(cPeek, cChar);

        if (iTagID != 0)
        {
            if (iTagID < 0)
            {
                // negative ID signals a single character tag
                iTagID *= -1;

                oChunk.sTag = oHE.GetString(iTagID);

                // see if we got fully closed tag
                if (cChar == (byte)'>')
                {
                    iCurPos += 2;
                    goto ReturnChunk;
                }

                cPeek = cChar;
                iCurPos++;

                // everything else means we need to continue scanning as we may have params and stuff
                goto AttributeParsing;
            }
            else
            {
                // ok, we have here 2 or more character string that we need to check further
                // often when we have full 2 char match the next char will be >, if that's the case
                // then we definitely matched our tag
                byte cNextChar = bHTML[iCurPos + 2];

                if (cNextChar == (byte)'>')
                {
                    oChunk.sTag = oHE.GetTwoCharString(cPeek, cChar);
                    iCurPos += 3;

                    goto ReturnChunk;
                }

                // ok, check next char for space, if that's the case we still got our tag
                // but need to skip to attribute parsing
                if (cNextChar == (byte)' ')
                {
                    oChunk.sTag = oHE.GetTwoCharString(cPeek, cChar);
                    iCurPos += 2;

                    cPeek = cNextChar;

                    goto AttributeParsing;
                }

                // ok, we are not very lucky, but it is still worth fighting for
                // now we need to check fully long string against what we have matched, maybe
                // we got exact match and we can avoid full parsing of the tag
                byte[] bTag = oHE.GetStringData(iTagID);

                if (iCurPos + bTag.Length + 5 >= iDataLength)
                {
                    goto TagParsing;
                }

                // in a loop (and this is not an ideal solution, but still)
                for (int i = 2; i < bTag.Length; i++)
                {
                    // if a single char is not matched, then we fall back to full parsing
                    if (bTag[i] != bHTML[iCurPos + i])
                    {
                        goto TagParsing;
                    }
                }

                // ok we matched full long word, but we need to be sure that char
                // after the word is ' ' or '>' as otherwise we may have matched prefix of even longer
                // word
                cNextChar = bHTML[iCurPos + bTag.Length];

                if (cNextChar == (byte)'>')
                {
                    oChunk.sTag = oHE.GetString(iTagID);
                    iCurPos += bTag.Length + 1;

                    goto ReturnChunk;
                }

                if (cNextChar == (byte)' ')
                {
                    cPeek = cNextChar;
                    oChunk.sTag = oHE.GetString(iTagID);
                    iCurPos += bTag.Length;

                    goto AttributeParsing;
                }

                // no luck: we need to parse tag fully as our heuristical matching failed miserably :'o(
            }
        }
    }

TagParsing:

    sText.Clear();

    byte bCharType = 0;

    // STAGE 1: parse tag (anything until > or /> or whitespace leading to start of attribute)
    while (cPeek != 0)
    {
        bCharType = bTagCharTypes[cPeek];

        if (bCharType == (byte)TagCharType.WhiteSpace)
        {
            iCurPos++;

            // speculative loop unroll -- we have a very good chance of seeing non-space char next
            // so instead of setting up loop we will just read it directly, this should save ticks
            // on having to prepare while() loop
            if (iCurPos < iDataLength)
            {
                cChar = bHTML[iCurPos++];
            }
            else
            {
                cChar = 0;
            }

            bCharType = bTagCharTypes[cChar];

            if (bCharType == (byte)TagCharType.WhiteSpace)
            {
                while (iCurPos < iDataLength)
                {
                    cChar = bHTML[iCurPos++];

                    bCharType = bTagCharTypes[cChar];
                    if (bCharType == (byte)TagCharType.WhiteSpace)
                    {
                        continue;
                    }

                    break;
                }

                if (iCurPos >= iDataLength)
                {
                    cChar = 0;
                }
            }

            // now, if we have already got tag it means that we are most likely
            // going to need to parse tag attributes
            if (sText.iBufPos > 0)
            {
                oChunk.sTag = sText.SetToStringASCII();

                // step back so that cPeek is again the char at iCurPos
                iCurPos--;

                if (iCurPos < iDataLength)
                {
                    cPeek = bHTML[iCurPos];
                }
                else
                {
                    cPeek = 0;
                }

                break;
            }
        }
        else
        {
            // consume the peeked char: bHTML[iCurPos] is the same byte as cPeek here
            if (iCurPos < iDataLength)
            {
                cChar = bHTML[iCurPos++];
            }
            else
            {
                cChar = 0;
            }
        }

        if (iCurPos < iDataLength)
        {
            cPeek = bHTML[iCurPos];
        }
        else
        {
            cPeek = 0;
        }

        // most likely we should have lower-cased ASCII char
        if (bCharType == (byte)TagCharType.LowerCasedASCIIorDigit)
        {
            sText.bBuffer[sText.iBufPos++] = cChar;
            continue;
        }

        // tag end - we did not have any params
        if (cChar == (byte)'>')
        {
            if (sText.iBufPos > 0)
            {
                oChunk.sTag = sText.SetToStringASCII();
            }

            if (!oChunk.bClosure)
            {
                oChunk.oType = HTMLchunkType.OpenTag;
            }

            return oChunk;
        }

        // closure of tag sign
        if (cChar == (byte)'/')
        {
            oChunk.bClosure = true;
            // a slash seen after some tag text marks an end closure; a leading slash does not
            oChunk.bEndClosure = (sText.iBufPos > 0);
            oChunk.oType = HTMLchunkType.CloseTag;
            continue;
        }

        // 03/08/08 XML support: ?xml tags
        if (cChar == (byte)'?')
        {
            sText.bBuffer[sText.iBufPos++] = cChar;
            continue;
        }

        // nope, we have got upper cased ASCII char - this seems to be LESS likely than > and /
        if (bCharType > 32)
        {
            // bCharType in this case contains already lower-cased char
            sText.bBuffer[sText.iBufPos++] = bCharType;
            continue;
        }

        // we might have namespace : sign here - all text before would have to be
        // saved as namespace and we will need to continue parsing actual tag
        if (bCharType == (byte)TagCharType.NameSpaceColon)
        {
            // ok here we got a choice - we can just continue and treat the whole
            // thing as a single tag with namespace stuff prefixed, OR
            // we can separate first part into namespace and keep tag as normal;
            // the first option is what is done here
            sText.bBuffer[sText.iBufPos++] = (byte)':';
            continue;
        }

        // ok, we have got some other char - we break out to deal with it in attributes part
        break;
    }

    if (cPeek == 0)
    {
        return oChunk;
    }

    // STAGE 2: parse attributes (if any available)
    // attribute name can be standalone or with value after =
    // attribute itself can't have entities or anything like this - we expect it to be in ASCII characters

AttributeParsing:

    string sAttrName;

    if (iTagID != 0)
    {
        // first, skip whitespace:
        if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
        {
            // most likely next char is not-whitespace
            iCurPos++;

            if (iCurPos >= iDataLength)
            {
                goto ReturnChunk;
            }

            cPeek = bHTML[iCurPos];

            if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
            {
                // ok long loop here then
                while (iCurPos < iDataLength)
                {
                    cPeek = bHTML[iCurPos++];

                    if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                    {
                        continue;
                    }

                    break;
                }

                if (cPeek == (byte)'>')
                {
                    goto ReturnChunk;
                }

                iCurPos--;

                if (iCurPos >= iDataLength)
                {
                    goto ReturnChunk;
                }
            }

            if (iCurPos >= iDataLength)
            {
                goto ReturnChunk;
            }
        }

        // ok we have got matched tag, it is possible that we might be able to quickly match
        // attribute name known to be used for that tag:
        int iAttrID = oHE.MatchAttr(cPeek, iTagID);

        if (iAttrID > 0)
        {
            byte[] bAttr = oHE.GetAttrData(iAttrID);

            if (iCurPos + bAttr.Length + 2 >= iDataLength)
            {
                goto ActualAttributeParsing;
            }

            // in a loop (and this is not an ideal solution, but still)
            for (int i = 1; i < bAttr.Length; i++)
            {
                // if a single char is not matched, then we fall back to full attribute parsing
                if (bAttr[i] != bHTML[iCurPos + i])
                {
                    goto ActualAttributeParsing;
                }
            }

            byte cNextChar = bHTML[iCurPos + bAttr.Length];

            // ok, we expect next symbol to be =
            if (cNextChar == (byte)'=')
            {
                sAttrName = oHE.GetAttr(iAttrID);
                iCurPos += bAttr.Length + 1;
                cPeek = bHTML[iCurPos];

                goto AttributeValueParsing;
            }
        }
    }

ActualAttributeParsing:

    sText.Clear();

    // doing exactly the same thing as in tag parsing
    while (cPeek != 0)
    {
        bCharType = bTagCharTypes[cPeek];

        if (bCharType == (byte)TagCharType.WhiteSpace)
        {
            iCurPos++;

            // speculative loop unroll -- we have a very good chance of seeing non-space char next
            // so instead of setting up loop we will just read it directly, this should save ticks
            // on having to prepare while() loop
            if (iCurPos < iDataLength)
            {
                cChar = bHTML[iCurPos++];
            }
            else
            {
                cPeek = 0;
                break;
            }

            bCharType = bTagCharTypes[cChar];

            if (bCharType == (byte)TagCharType.WhiteSpace)
            {
                while (iCurPos < iDataLength)
                {
                    cChar = bHTML[iCurPos++];

                    bCharType = bTagCharTypes[cChar];
                    if (bCharType == (byte)TagCharType.WhiteSpace)
                    {
                        continue;
                    }

                    break;
                }

                if (iCurPos >= iDataLength)
                {
                    cChar = 0;
                    cPeek = 0;
                    break;
                }
            }

            // now, if we have already got attribute name it means that we need to go to parse value (which may not be present)
            if (sText.iBufPos > 0)
            {
                // step back so that cPeek is again the char at iCurPos
                iCurPos--;

                if (iCurPos < iDataLength)
                {
                    cPeek = bHTML[iCurPos];
                }
                else
                {
                    cPeek = 0;
                }

                // ok, we have got attribute name and now we have got next char there
                // most likely we have got = here and then value
                if (cPeek == (byte)'=')
                {
                    // move forward one char
                    iCurPos++;

                    if (iCurPos < iDataLength)
                    {
                        cPeek = bHTML[iCurPos];
                    }
                    else
                    {
                        cPeek = 0;
                    }

                    break;
                }

                // or we can have end of tag itself, doh!
                if (cPeek == (byte)'>')
                {
                    // move forward one char
                    iCurPos++;

                    if (sText.iBufPos > 0)
                    {
                        oChunk.AddParam(sText.SetToStringASCII(), "", (byte)' ');
                    }

                    if (!oChunk.bClosure)
                    {
                        oChunk.oType = HTMLchunkType.OpenTag;
                    }

                    return oChunk;
                }

                // closure
                if (cPeek == (byte)'/')
                {
                    oChunk.bClosure = true;
                    oChunk.bEndClosure = true;
                    oChunk.oType = HTMLchunkType.CloseTag;
                    continue;
                }

                // ok, we have got new char starting after current attribute name is fully parsed
                // this means the attribute name is on its own and the char we found is start
                // of a new attribute
                oChunk.AddParam(sText.SetToStringASCII(), "", (byte)' ');
                sText.Clear();
                goto AttributeParsing;
            }
        }
        else
        {
            // consume the peeked char: bHTML[iCurPos] is the same byte as cPeek here
            if (iCurPos < iDataLength)
            {
                cChar = bHTML[iCurPos++];
            }
            else
            {
                cChar = 0;
            }
        }

        if (iCurPos < iDataLength)
        {
            cPeek = bHTML[iCurPos];
        }
        else
        {
            cPeek = 0;
        }

        // most likely we should have lower-cased ASCII char here
        if (bCharType == (byte)TagCharType.LowerCasedASCIIorDigit)
        {
            sText.bBuffer[sText.iBufPos++] = cChar;
            continue;
        }

        // = with attribute value to follow
        if (cChar == (byte)'=')
        {
            break;
        }

        // nope, we have got upper cased ASCII char - this seems to be LESS likely than > and /
        if (bCharType > 32)
        {
            // bCharType in this case contains already lower-cased char
            sText.bBuffer[sText.iBufPos++] = bCharType;
            continue;
        }

        // tag end - we did not have any params
        if (cChar == (byte)'>')
        {
            if (sText.iBufPos > 0)
            {
                oChunk.AddParam(sText.SetToStringASCII(), "", (byte)' ');
            }

            if (!oChunk.bClosure)
            {
                oChunk.oType = HTMLchunkType.OpenTag;
            }

            return oChunk;
        }

        // closure of tag sign
        if (cChar == (byte)'/')
        {
            oChunk.bClosure = true;
            oChunk.bEndClosure = true;
            oChunk.oType = HTMLchunkType.CloseTag;
            continue;
        }

        // some other char
        sText.bBuffer[sText.iBufPos++] = cChar;
    }

    if (cPeek == 0)
    {
        if (sText.iBufPos > 0)
        {
            oChunk.AddParam(sText.SetToStringASCII(), "", (byte)' ');
        }

        if (!oChunk.bClosure)
        {
            oChunk.oType = HTMLchunkType.OpenTag;
        }

        return oChunk;
    }

    sAttrName = sText.SetToStringASCII();

AttributeValueParsing:

    // ***********************************************************************
    // STAGE 3: parse attribute value
    // ***********************************************************************

    // the value could be just string, or in quotes (single or double)
    // or we can have next attribute name start, in which case we will jump back to attribute parsing

    // for tracking quotes purposes
    byte cQuotes = cPeek;

    int iValueStartOffset;

    // skip whitespace if any
    if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
    {
        iCurPos++;

        // speculative loop unroll -- we have a very good chance of seeing non-space char next
        // so instead of setting up loop we will just read it directly, this should save ticks
        // on having to prepare while() loop
        if (iCurPos < iDataLength)
        {
            cPeek = bHTML[iCurPos];
        }
        else
        {
            iValueStartOffset = iCurPos - 1;
            goto AttributeValueEnd;
        }

        if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
        {
            while (iCurPos < iDataLength)
            {
                cPeek = bHTML[iCurPos++];

                if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
                {
                    continue;
                }

                iCurPos--;
                break;
            }

            if (iCurPos >= iDataLength)
            {
                iValueStartOffset = iCurPos - 1;
                goto AttributeValueEnd;
            }
        }

        cQuotes = cPeek;
    }

    // because we deal with VALUE of the attribute it means we can't lower-case it,
    // or skip whitespace (if in quotes), which in practice means that we don't need to copy
    // it to temporary string buffer, we can just remember starting offset and then create string from
    // data in bHTML

    // ok, first char can be one of the quote chars or something else
    if (cPeek != '\"' && cPeek != '\'')
    {
        iValueStartOffset = iCurPos;

        cQuotes = (byte)' ';

        // any other char here means we have value up until next whitespace or end of tag
        // this gives us good opportunity to scan fairly quickly without otherwise redundant
        // checks - this should happen fairly rarely, however loop dealing with data between quotes
        // will happen often enough and its best to eliminate as much stuff from it as possible

        // move to next char
        if (iCurPos < iDataLength)
        {
            cPeek = bHTML[iCurPos++];
        }
        else
        {
            goto AttributeValueEnd;
        }

        while (cPeek != 0)
        {
            // if whitespace then we got our value and need to go back to param
            if (cPeek <= 32 && bTagCharTypes[cPeek] == (byte)TagCharType.WhiteSpace)
            {
                oChunk.AddParam(sAttrName, GetString(iValueStartOffset, iCurPos - iValueStartOffset - 1), (byte)' ');
                iCurPos--;
                goto AttributeParsing;
            }

            // end of tag?
            if (cPeek == (byte)'>')
            {
                break;
            }

            if (iCurPos < iDataLength)
            {
                cPeek = bHTML[iCurPos++];
            }
            else
            {
                // pretend one more byte was read so that the length computed at
                // AttributeValueEnd covers the last real byte of the value
                iCurPos = iDataLength + 1;
                goto AttributeValueEnd;
            }
        }

        // ok we are done, add outstanding attribute
        oChunk.AddParam(sAttrName, GetString(iValueStartOffset, iCurPos - iValueStartOffset - 1), (byte)' ');

        goto ReturnChunk;
    }

    // move one step forward, past the opening quote
    iCurPos++;

    iValueStartOffset = iCurPos;

    if (iCurPos < iDataLength)
    {
        cPeek = bHTML[iCurPos++];
    }
    else
    {
        goto AttributeValueEnd;
    }

    // attribute value parsing from between two quotes
    while (cPeek != 0)
    {
        // check whether we have got possible entity (can be anything starting with &)
        if (cPeek == 38)
        {
            int iPrevPos = iCurPos;

            char cEntityChar = oE.CheckForEntity(bHTML, ref iCurPos, iDataLength);

            if (cEntityChar == 0)
            {
                // not an entity after all: carry on with the next byte
                if (iCurPos < iDataLength)
                {
                    cPeek = bHTML[iCurPos++];
                }
                else
                {
                    break;
                }

                continue;
            }
            else
            {
                // okay we have got an entity, our hope of not having to copy stuff into variable
                // is over, we have to continue in a slower fashion :(
                // but thankfully this should happen very rarely, so, annoying to code, but
                // most codepaths will run very fast!
                int iPreEntLen = iPrevPos - iValueStartOffset - 1;

                // 14/05/08 need to clear text - it contains attribute name text
                sText.Clear();

                // copy previous data
                if (iPreEntLen > 0)
                {
                    Array.Copy(bHTML, iValueStartOffset, sText.bBuffer, 0, iPreEntLen);
                    sText.iBufPos = iPreEntLen;
                }

                // we have to skip now to next byte, since
                // some converted chars might well be control chars like >
                oChunk.bEntities = true;

                // test the decoded entity itself (as the loop below does for later entities);
                // cChar only holds a stale byte from attribute name parsing at this point
                if (cEntityChar == (byte)'<')
                {
                    oChunk.bLtEntity = true;
                }

                sText.Append(cEntityChar);

                if (iCurPos < iDataLength)
                {
                    cPeek = bHTML[iCurPos++];
                }
                else
                {
                    goto AttributeValueEnd;
                }

                // okay, we continue here using in effect new inside loop as we might have more entities here
                // attribute value parsing from between two quotes
                while (cPeek != 0)
                {
                    // check whether we have got possible entity (can be anything starting with &)
                    if (cPeek == 38)
                    {
                        char cNewEntityChar = oE.CheckForEntity(bHTML, ref iCurPos, iDataLength);

                        if (cNewEntityChar != 0)
                        {
                            if (cNewEntityChar == (byte)'<')
                            {
                                oChunk.bLtEntity = true;
                            }

                            sText.Append(cNewEntityChar);

                            if (iCurPos < iDataLength)
                            {
                                cPeek = bHTML[iCurPos++];
                            }
                            else
                            {
                                goto AttributeValueEnd;
                            }

                            continue;
                        }
                    }

                    // check if is end of quotes
                    if (cPeek == cQuotes)
                    {
                        // ok we finished scanning it: add param with value and then go back to param name parsing
                        oChunk.AddParam(sAttrName, sText.SetToString(), cQuotes);

                        if (iCurPos < iDataLength)
                        {
                            cPeek = bHTML[iCurPos];
                        }
                        else
                        {
                            // data ends right after the closing quote: the attribute has already
                            // been added, so skip the trailing AddParam below
                            goto ReturnChunk;
                        }

                        goto AttributeParsing;
                    }

                    sText.bBuffer[sText.iBufPos++] = cPeek;

                    if (iCurPos < iDataLength)
                    {
                        cPeek = bHTML[iCurPos++];
                    }
                    else
                    {
                        break;
                    }
                }

                // data ended (or a zero byte was hit) before the closing quote
                oChunk.AddParam(sAttrName, sText.SetToString(), cQuotes);
                goto ReturnChunk;
            }
        }

        // check if is end of quotes
        if (cPeek == cQuotes)
        {
            // ok we finished scanning it: add param with value and then go back to param name parsing
            oChunk.AddParam(sAttrName, GetString(iValueStartOffset, iCurPos - iValueStartOffset - 1), cQuotes);

            if (iCurPos < iDataLength)
            {
                cPeek = bHTML[iCurPos];
            }
            else
            {
                // data ends right after the closing quote: the attribute has already
                // been added, so do not fall into AttributeValueEnd and add it again
                goto ReturnChunk;
            }

            goto AttributeParsing;
        }

        if (iCurPos < iDataLength)
        {
            cPeek = bHTML[iCurPos++];
        }
        else
        {
            break;
        }
    }

AttributeValueEnd:

    // ok we are done, add outstanding attribute
    int iLen = iCurPos - iValueStartOffset - 1;
    if (iLen > 0)
    {
        oChunk.AddParam(sAttrName, GetString(iValueStartOffset, iLen), cQuotes);
    }
    else
    {
        oChunk.AddParam(sAttrName, "", cQuotes);
    }

ReturnChunk:

    if (oChunk.bClosure)
    {
        oChunk.oType = HTMLchunkType.CloseTag;
    }
    else
    {
        oChunk.oType = HTMLchunkType.OpenTag;
    }

    return oChunk;
}
/// <summary>
/// Parses the next chunk without consuming it: CurPos and CurrentChunk are saved before
/// the call to ParseNext and put back afterwards.
/// </summary>
/// <remarks>
/// CurrentChunk is replaced by a fresh HTMLchunk for the duration of the call, presumably so
/// that ParseNext does not overwrite the chunk the caller is still holding (confirm against
/// ParseNext). The saved state is restored in a finally block, so it is also restored when
/// ParseNext throws.
/// </remarks>
/// <returns>Whatever ParseNext returns for the upcoming chunk</returns>
public HTMLchunk PeakNext()
{
    var savedPos = CurPos;
    var savedChunk = CurrentChunk;

    CurrentChunk = new HTMLchunk(true);
    try
    {
        return ParseNext();
    }
    finally
    {
        CurrentChunk = savedChunk;
        CurPos = savedPos;
    }
}
/// <summary>
/// Updates the scraping state in response to an opening tag.
/// </summary>
/// <param name="oChunk">Open-tag chunk; only its <c>sTag</c> is inspected.</param>
/// <param name="state">Current state value, updated in place.</param>
private void HandleOpenTag(HTMLchunk oChunk, ref int state)
{
    switch (oChunk.sTag)
    {
        case "ol":
            // <ol> unconditionally resets the state to 1
            state = 1;
            break;

        case "li":
            // <li> moves to state 2, but only once the state is positive
            if (state > 0)
            {
                state = 2;
            }
            break;

        case "a":
            // <a> advances states 4, 8 and 10 by one; every other state is left alone
            switch (state)
            {
                case 4:
                case 8:
                case 10:
                    state++;
                    break;
            }
            break;
    }
}
/// <summary>
/// Updates the scraping state (or the div counter) in response to an opening tag.
/// </summary>
/// <param name="oChunk">Open-tag chunk; only its <c>sTag</c> is inspected.</param>
/// <param name="state">Current state value, updated in place.</param>
private void HandleOpenTag( HTMLchunk oChunk, ref int state )
{
    switch ( oChunk.sTag )
    {
        case "ol":
            // <ol> seen in state 8 moves to state 9
            if ( state == 8 )
            {
                state = 9;
            }
            break;

        case "table":
            // <table> seen in state 11 moves to state 12
            if ( state == 11 )
            {
                state = 12;
            }
            break;

        case "div":
            // <div> seen in states 6, 14 or 16 is counted; the state itself is unchanged
            switch ( state )
            {
                case 6:
                case 14:
                case 16:
                    divCount++;
                    break;
            }
            break;
    }
}
/// <summary>
/// Clears the references held by this instance. Only the first call has an effect;
/// later calls return immediately.
/// </summary>
/// <param name="bDisposing">Not read by this method.</param>
private void Dispose(bool bDisposing)
{
    // already disposed -- nothing left to release
    if (bDisposed)
    {
        return;
    }

    bDisposed = true;

    // drop every held reference
    bHTML = null;
    oChunk = null;
    sText = null;
    oE = null;
    oP = null;
}
/// <summary>
/// Appends the text of a chunk to the field of <c>dictResult</c> that corresponds to the
/// current state. States without a case below are ignored.
/// </summary>
/// <param name="oChunk">Text chunk; its <c>oHTML</c> is the text that gets appended.</param>
/// <param name="state">Current state value; read but never modified here.</param>
private void HandleText( HTMLchunk oChunk, ref int state )
{
    switch ( state )
    {
        case 2:
            dictResult.Word += oChunk.oHTML.Trim( ' ', '\t', '\r', '\n' );
            break;

        case 4:
            // pronunciation is the only field that is HTML-decoded instead of trimmed
            dictResult.Pronunciation += System.Web.HttpUtility.HtmlDecode( oChunk.oHTML );
            break;

        case 6:
            dictResult.ChineseExplanations += oChunk.oHTML.Trim( ' ', '\t', '\r', '\n' );
            break;

        case 9:
            dictResult.Examples += oChunk.oHTML.Trim( ' ', '\t', '\r', '\n' );
            break;

        case 12:
            dictResult.Variations += oChunk.oHTML.Trim( ' ', '\t', '\r', '\n' );
            break;

        case 14:
            dictResult.EnglishExplanations += oChunk.oHTML.Trim( ' ', '\t', '\r', '\n' );
            break;

        case 16:
            dictResult.FromEncyclopedia += oChunk.oHTML.Trim( ' ', '\t', '\r', '\n' );
            break;
    }
}
/// <summary>
/// Walks the document chunk by chunk and writes a description of every chunk to the console.
/// When <c>bReadLineDelay</c> is set it waits for ENTER after each chunk; otherwise it only
/// terminates the output line.
/// </summary>
/// <param name="oP">Parser object</param>
void ParseAndPrint(HTMLparser oP)
{
    if (bReadLineDelay)
    {
        Console.WriteLine("Parsing HTML, will print each parsed chunk, press ENTER after each to continue");
    }

    // Passed by reference to HandleMetaEncoding for every META tag, so that it can tell
    // whether an encoding was already chosen (text seen before the META tag, typically
    // the TITLE, may need re-encoding once the charset becomes known).
    // If the charset arrives in an HTTP Content-Type header instead, it has to be applied
    // to the parser by the caller.
    bool bEncodingSet = false;

    // The parser hands back the same chunk object on every call, so a chunk must not be
    // kept or destroyed while parsing is still in progress. A null chunk ends the loop.
    for (HTMLchunk chunk = oP.ParseNext(); chunk != null; chunk = oP.ParseNext())
    {
        // set by the tag-like cases when the shared parameter dump below should run
        bool printParams = false;

        switch (chunk.oType)
        {
            // opening tag, e.g. <a href="">: parameters are always dumped
            case HTMLchunkType.OpenTag:
                Console.Write("Open tag: " + chunk.sTag);
                printParams = true;
                break;

            // closing tag, e.g. </a>: parameters are dumped only when there are some
            case HTMLchunkType.CloseTag:
                Console.Write("Closed tag: " + chunk.sTag);
                printParams = chunk.iParams > 0;
                break;

            // data between <script> and </script>
            case HTMLchunkType.Script:
                // the raw HTML is not filled in automatically unless one of these flags is on,
                // so request it explicitly
                if (!oP.bAutoKeepScripts && !oP.bKeepRawHTML)
                {
                    oP.SetRawHTML(chunk);
                }

                Console.Write(chunk.oHTML.Length > 0
                    ? "Script: " + chunk.oHTML
                    : "Script: [ignored for performance reasons]");
                printParams = chunk.iParams > 0;
                break;

            // HTML comment, i.e. the content between <!-- and -->
            case HTMLchunkType.Comment:
                if (oP.bKeepRawHTML || oP.bAutoKeepComments)
                {
                    Console.Write("Comment: " + chunk.oHTML);
                }
                else
                {
                    // comment text was not collected; calling oP.SetRawHTML(chunk) here
                    // would fetch it on demand
                    Console.Write("Comment: [ignored for performance reasons]");
                }
                break;

            // plain text
            case HTMLchunkType.Text:
                // whitespace-only text is skipped entirely (no prompt, no newline)
                // when running interactively with whitespace compression on
                if (oP.bCompressWhiteSpaceBeforeTag && chunk.oHTML.Trim().Length == 0 && bReadLineDelay)
                {
                    continue;
                }

                Console.Write("Text: '{0}'", chunk.oHTML);
                break;
        }

        if (printParams)
        {
            // META tags may announce the charset, so they are also passed to the encoding handler
            if (chunk.sTag.Length == 4 && chunk.sTag == "meta")
            {
                HandleMetaEncoding(oP, chunk, ref bEncodingSet);
            }

            if (chunk.bHashMode)
            {
                // hash mode: parameters live in a name -> value table
                if (chunk.oParams.Count > 0)
                {
                    foreach (string sParam in chunk.oParams.Keys)
                    {
                        string sValue = chunk.oParams[sParam].ToString();

                        if (sValue.Length > 0)
                        {
                            Console.Write(" {0}='{1}'", sParam, sValue);
                        }
                        else
                        {
                            Console.Write(" {0}", sParam);
                        }
                    }
                }
            }
            else
            {
                // array mode: parallel arrays of names, values and the quote byte that
                // surrounded each value in the source
                for (int i = 0; i < chunk.iParams; i++)
                {
                    var quote = chunk.cParamChars[i];

                    if (quote != (byte)' ')
                    {
                        // re-emit the value wrapped in the original quote character
                        Console.Write(" {0}={1}{2}{1}", chunk.sParams[i], (char)quote, chunk.sValues[i]);
                    }
                    else if (chunk.sValues[i].Length == 0)
                    {
                        // unquoted parameter without a value
                        Console.Write(" {0}", chunk.sParams[i]);
                    }
                    else
                    {
                        // unquoted parameter with a value
                        Console.Write(" {0}={1}", chunk.sParams[i], chunk.sValues[i]);
                    }
                }
            }
        }

        if (bReadLineDelay)
        {
            Console.ReadLine();
        }
        else
        {
            Console.WriteLine("");
        }
    }
}
/// <summary>
/// Parses the next chunk starting at the current position and returns it. The shared
/// <c>oChunk</c> is cleared and reused: it is either filled with a run of text bytes, or
/// replaced by the result of <c>ParseTag</c> when a '&lt;' is met with no pending text.
/// </summary>
/// <param name="bKeepWhiteSpace">If true then whitespace will be preserved (slower)</param>
/// <returns>
/// The parsed chunk, or null when the scanning loop ended without any byte having been
/// buffered (i.e. the end of the data was reached).
/// </returns>
public HTMLchunk ParseNext(bool bKeepWhiteSpace)
{
    // start from an empty chunk that is assumed to be text until a tag is found
    oChunk.Clear();
    oChunk.oType=HTMLchunkType.Text;

    // set when the whitespace-skipping loop below dropped at least one byte
    bool bWhiteSpace=false;
    byte cChar=0x00;

    while(true)
    {
        if(!bKeepWhiteSpace)
        {
            // inlined whitespace skipping: consume spaces, tabs, CR and LF
            bWhiteSpace=false;

            while(iCurPos<iDataLength)
            {
                cChar=bHTML[iCurPos++];

                if(cChar!=' ' && cChar!='\t' && cChar!=13 && cChar!=10)
                {
                    // found a byte that can be used further down as is,
                    // so there is no need to put it back and read it again
                    goto WhiteSpaceDone;
                }
                else
                    bWhiteSpace=true;
            }

            // ran out of data while skipping whitespace
            break;
        }
        else
        {
            cChar=NextChar();

            // a zero byte from NextChar() ends the scan
            if(cChar==0)
                break;
        }

    WhiteSpaceDone:

        switch((byte)cChar)
        {
            // 60 is '<'
            case 60:

                // we may have found a bit of text before getting to the tag,
                // in which case the '<' is put back and the text is returned first;
                // the tag will be parsed on the next call
                if(oChunk.iBufPos>0 || bWhiteSpace)
                {
                    // add one space to compensate for the skipped whitespace before the tag,
                    // since it often serves as a delimiter between words
                    if(bWhiteSpace)
                        oChunk.Append(0x20);

                    // step back so that the '<' is read again next time
                    iCurPos--;

                    // the chunk is finalised only in text mode
                    if(bTextMode)
                        oChunk.Finalise();

                    return oChunk;
                }

                // no pending text: parse the tag; when raw HTML is kept, the tag chunk
                // is also finalised before being returned
                if(!bKeepRawHTML)
                    return ParseTag(bKeepWhiteSpace);
                else
                {
                    oChunk=ParseTag(bKeepWhiteSpace);
                    oChunk.Finalise();

                    return oChunk;
                }

            // carriage return (13) is always dropped
            case 13:
                break;

            // line feed (10) is kept only when whitespace is preserved
            case 10:
                if(bKeepWhiteSpace)
                {
                    oChunk.Append(cChar);
                }
                break;

            default:

                // outside text mode ordinary bytes are not collected at all
                if(bTextMode)
                {
                    // check whether this is the start of an entity
                    if(cChar=='&')
                    {
                        cChar=(byte)CheckForEntity();

                        // zero means no entity was recognised: restore the '&' itself
                        if(cChar==0)
                            cChar=(byte)'&';
                        else
                        {
                            oChunk.bEntities=true;

                            // remember that a '<' in the text came from an entity
                            if(cChar=='<')
                                oChunk.bLtEntity=true;
                        }
                    }

                    if(bReturnSplitWords)
                    {
                        if(bWhiteSpace)
                        {
                            // whitespace was skipped before this byte: return the text
                            // collected so far and step back one byte for the next call
                            if(oChunk.iBufPos>0)
                            {
                                iCurPos--;
                                oChunk.Finalise();
                                return oChunk;
                            }
                        }
                        else
                        {
                            // punctuation also ends the current word; the punctuation byte
                            // is neither appended nor put back
                            if(char.IsPunctuation((char)cChar))
                            {
                                if(oChunk.iBufPos>0)
                                {
                                    oChunk.Finalise();
                                    return oChunk;
                                }
                                else
                                    break;
                            }
                        }
                    }
                    else
                    {
                        // collapse the skipped whitespace into a single space
                        if(bWhiteSpace && bTextMode)
                            oChunk.Append((byte)' ');
                    }

                    oChunk.Append(cChar);
                }

                break;
        };
    }

    // nothing was buffered, so there is no chunk to return
    if(oChunk.iBufPos==0)
        return null;

    if(bTextMode)
        oChunk.Finalise();

    return oChunk;
}
/// <summary>
/// Inspects the parameters of a tag and advances the scraping state on specific
/// <c>class</c> values, or stores <c>href</c> values on the current result item.
/// </summary>
/// <param name="oChunk">Tag chunk whose <c>sParams</c>/<c>sValues</c> arrays are scanned.</param>
/// <param name="state">Current state value, updated in place.</param>
private void HandleParam(HTMLchunk oChunk, ref int state)
{
    for (int i = 0; i < oChunk.iParams; i++)
    {
        string name = oChunk.sParams[i];
        string value = oChunk.sValues[i];

        if (name == "class")
        {
            if (value == "g" && state == 2)
            {
                // class="g" in state 2 starts a new result: flush the previous item
                // if it already has a URL, then continue with a fresh one
                state = 3;

                if (!string.IsNullOrEmpty(item.Url))
                {
                    searchResult.Results.Add(item);
                    item = new SearchEngineResult.ResultItem();
                }
            }
            else if (value == "r" && state == 3)
            {
                state = 4;
            }
            else if (value == "s" && state == 6)
            {
                state = 7;
            }
            else if (value == "gl" && state == 7)
            {
                state = 8;
            }
        }
        else if (name == "href")
        {
            if (state == 5)
            {
                item.Url = value;
            }
            else if (state == 9 || state == 11)
            {
                // URLs are matched ordinally: string.Contains does not depend on the
                // current culture, unlike IndexOf(string)
                if (value.Contains("q=related"))
                {
                    item.SimilarUrl = value;
                }
                else if (value.Contains("q=cache"))
                {
                    item.CacheUrl = value;
                }
            }
        }
    }
}