/// <summary>
/// Saves <paramref name="doc"/> through its IPersistStreamInit interface into an
/// in-memory stream and decodes the saved bytes into a string.
/// </summary>
/// <param name="doc">Document to serialize. When null, the method returns null.</param>
/// <param name="enc">
/// Encoding used to decode the saved bytes. When null, Encoding.GetEncoding(0) is used.
/// Unicode detection (byte order mark, then an alternating-zero heuristic) only runs when
/// the effective encoding is that default instance; any other encoding is used as-is and
/// no byte order mark is stripped.
/// </param>
/// <returns>The decoded document text, or null when <paramref name="doc"/> is null.</returns>
public static String GetDocumentSource(ref HTMLDocument doc, Encoding enc)
{
    if (doc == null)
    {
        return null;
    }

    Encoding defaultEncoding = Encoding.GetEncoding(0); // Windows default
    Encoding theEncoding = enc;
    if (theEncoding == null)
    {
        theEncoding = defaultEncoding;
    }

    // Don't try to detect Unicode if we were passed an encoding other than the default.
    bool detectUnicode = Object.ReferenceEquals(theEncoding, defaultEncoding);

    // use the routine from htmlwrapper
    byte[] data;
    using (MemoryStream memstream = new MemoryStream())
    {
        ComStream cstream = new ComStream(memstream);
        IPersistStreamInit pStreamInit = (IPersistStreamInit)doc;
        pStreamInit.Save(cstream, false);

        // Snapshot everything that was written, independent of the stream position.
        data = memstream.ToArray();
    }

    int bomLength = 0;
    if (detectUnicode)
    {
        // Every probe is guarded by the real data length so short documents are
        // never classified from bytes that were not actually written.
        if (data.Length >= 3 && data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF)
        {
            theEncoding = Encoding.UTF8;
            bomLength = 3;
        }
        else if (data.Length >= 2 && data[0] == 0xFF && data[1] == 0xFE)
        {
            theEncoding = Encoding.Unicode; // UTF-16 little endian
            bomLength = 2;
        }
        else if (data.Length >= 2 && data[0] == 0xFE && data[1] == 0xFF)
        {
            theEncoding = Encoding.BigEndianUnicode;
            bomLength = 2;
        }
        else if (data.Length >= 8
                 && data[1] == 0 && data[3] == 0 && data[5] == 0 && data[7] == 0)
        {
            // No BOM, but alternate zero bytes: best guess is UTF-16 little endian.
            theEncoding = Encoding.Unicode;
        }
    }

    // Decode in one call: the BOM is skipped exactly once and the count is reduced
    // to match, and multi-byte characters can no longer be split between chunks.
    return theEncoding.GetString(data, bomLength, data.Length - bomLength);
}
/// <summary>
/// Saves <paramref name="doc"/> through its IPersistStreamInit interface into an
/// in-memory stream and decodes the saved bytes into a string.
/// </summary>
/// <param name="doc">Document to serialize. When null, the method returns null.</param>
/// <param name="enc">
/// Encoding used to decode the saved bytes. When null, Encoding.GetEncoding(0) is used.
/// Unicode detection (byte order mark, then an alternating-zero heuristic) only runs when
/// the effective encoding is that default instance; any other encoding is used as-is and
/// no byte order mark is stripped.
/// </param>
/// <returns>The decoded document text, or null when <paramref name="doc"/> is null.</returns>
public static String GetDocumentSource(ref HTMLDocument doc, Encoding enc)
{
    if (doc == null)
    {
        return null;
    }

    Encoding defaultEncoding = Encoding.GetEncoding(0); // Windows default
    Encoding theEncoding = enc;
    if (theEncoding == null)
    {
        theEncoding = defaultEncoding;
    }

    // Don't try to detect Unicode if we were passed an encoding other than the default.
    bool detectUnicode = Object.ReferenceEquals(theEncoding, defaultEncoding);

    // use the routine from htmlwrapper
    byte[] data;
    using (MemoryStream memstream = new MemoryStream())
    {
        ComStream cstream = new ComStream(memstream);
        IPersistStreamInit pStreamInit = (IPersistStreamInit)doc;
        pStreamInit.Save(cstream, false);

        // Snapshot everything that was written, independent of the stream position.
        data = memstream.ToArray();
    }

    int bomLength = 0;
    if (detectUnicode)
    {
        // Every probe is guarded by the real data length so short documents are
        // never classified from bytes that were not actually written.
        if (data.Length >= 3 && data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF)
        {
            theEncoding = Encoding.UTF8;
            bomLength = 3;
        }
        else if (data.Length >= 2 && data[0] == 0xFF && data[1] == 0xFE)
        {
            theEncoding = Encoding.Unicode; // UTF-16 little endian
            bomLength = 2;
        }
        else if (data.Length >= 2 && data[0] == 0xFE && data[1] == 0xFF)
        {
            theEncoding = Encoding.BigEndianUnicode;
            bomLength = 2;
        }
        else if (data.Length >= 8
                 && data[1] == 0 && data[3] == 0 && data[5] == 0 && data[7] == 0)
        {
            // No BOM, but alternate zero bytes: best guess is UTF-16 little endian.
            theEncoding = Encoding.Unicode;
        }
    }

    // Decode in one call: the BOM is skipped exactly once and the count is reduced
    // to match, and multi-byte characters can no longer be split between chunks.
    return theEncoding.GetString(data, bomLength, data.Length - bomLength);
}