/// <summary>
/// Reads more bytes than the configured cache size in a single call, rewinds, and checks
/// that the same bytes can be replayed while the stream still reports it can rewind.
/// </summary>
public void Test_HtmlStream4()
{
    const int cacheSize = 10;
    MemoryStream source = new MemoryStream(
        System.Text.Encoding.ASCII.GetBytes("abcdefghijklmnopqrstuvwxyz"));
    HtmlStream stream = new HtmlStream(source, cacheSize);

    // Before anything is read the stream reports that it cannot rewind.
    Assert.IsFalse(stream.CanRewind);

    // One read of 15 bytes - five more than the cache size - is expected to be cached in full.
    byte[] firstPass = new byte[15];
    stream.Read(firstPass, 0, firstPass.Length);
    Assert.IsTrue(stream.CanRewind);

    // Go back to the start; rewinding must remain possible afterwards.
    stream.Rewind();
    Assert.IsTrue(stream.CanRewind);

    // Rewinding must not move the underlying stream.
    Assert.AreEqual(15, source.Position);

    // The replay is expected to be served from the cache.
    byte[] secondPass = new byte[15];
    stream.Read(secondPass, 0, secondPass.Length);
    Assert.AreEqual("abcdefghijklmno", System.Text.Encoding.ASCII.GetString(secondPass));
    Assert.IsTrue(stream.CanRewind, "CanRewind should be true, since we should have expanded our cache");
}
/// <summary>
/// Fills the cache exactly, rewinds, then reads beyond the cached region and checks that
/// the stream no longer reports it can rewind once the read has gone past the cache.
/// </summary>
public void Test_HtmlStream3()
{
    const int cacheSize = 10;
    MemoryStream source = new MemoryStream(
        System.Text.Encoding.ASCII.GetBytes("abcdefghijklmnopqrstuvwxyz"));
    HtmlStream stream = new HtmlStream(source, cacheSize);

    // Before anything is read the stream reports that it cannot rewind.
    Assert.IsFalse(stream.CanRewind);

    // Read exactly as many bytes as the cache size.
    byte[] primer = new byte[cacheSize];
    stream.Read(primer, 0, primer.Length);
    Assert.IsTrue(stream.CanRewind);

    // Go back to the start; rewinding must remain possible afterwards.
    stream.Rewind();
    Assert.IsTrue(stream.CanRewind);

    // Rewinding must not move the underlying stream.
    Assert.AreEqual(10, source.Position);

    // This read covers the ten cached bytes plus five more that are not cached yet.
    byte[] replay = new byte[15];
    stream.Read(replay, 0, replay.Length);
    Assert.AreEqual("abcdefghijklmno", System.Text.Encoding.ASCII.GetString(replay));
    Assert.IsFalse(stream.CanRewind, "CanRewind should be false, since we should be past the cache");
}
/// <summary>
/// Returns an <see cref="HtmlTextReader"/> over the content at <paramref name="url"/> so that clients can
/// stream it as text. Similar in purpose to GetStreamAsync, except that the returned reader is created with
/// the configured default encoding and encoding-detection setting (GetStreamAsync does no encoding detection,
/// whereas GetStringAsync does).
/// </summary>
/// <param name="url">
/// Absolute URL of the document. A <c>file:</c> URL is opened directly from the local file system; any other
/// URL is fetched through the shared HttpClient.
/// </param>
/// <param name="options">
/// Options supplying the user agent, default encoding and detect-encoding flag. When <c>null</c>,
/// <c>HtmlClient.Options</c> is used instead.
/// </param>
/// <returns>
/// A reader whose <c>OriginatingUrl</c> is set to <paramref name="url"/>. For HTTP responses that carry
/// content, the content headers are copied into <c>OriginatingHttpHeaders</c>.
/// </returns>
/// <exception cref="HttpRequestException">The HTTP response did not indicate success.</exception>
public static async Task<HtmlTextReader> GetHtmlTextReaderAsync(string url, ClientOptions options)
{
    ClientOptions optionsToUse = options ?? HtmlClient.Options;
    Uri uri = new Uri(url);

    // See if the url points to a file. If so, return a reader with a file stream under the hood.
    if (uri.IsFile)
    {
        // LocalPath is the unescaped, OS-specific path (and keeps the host of a UNC path);
        // AbsolutePath is still percent-encoded, so e.g. a space would be looked up as "%20".
        FileStream fs = File.OpenRead(uri.LocalPath);
        try
        {
            HtmlStream stream = new HtmlStream(fs);

            // Use optionsToUse rather than options: the caller is allowed to pass null.
            HtmlTextReader fileReader = new HtmlTextReader(stream, optionsToUse.DefaultEncoding, EncodingConfidence.Tentative);
            fileReader.OriginatingUrl = url;
            return fileReader;
        }
        catch
        {
            // Do not leak the file handle if wrapping the stream fails.
            fs.Dispose();
            throw;
        }
    }

    // Set a user agent if one was specified.
    // NOTE(review): this mutates the default headers of the HttpClient shared by this static method, so
    // concurrent calls with different user agents can affect each other - confirm whether that matters.
    if (!string.IsNullOrEmpty(optionsToUse.UserAgent))
    {
        HttpClient.DefaultRequestHeaders.Remove("User-Agent");
        HttpClient.DefaultRequestHeaders.Add("User-Agent", optionsToUse.UserAgent);
    }

    // Get the HTTP response (only the headers are read at this point) and ensure success.
    HttpResponseMessage responseMessage = await HttpClient.GetAsync(uri, HttpCompletionOption.ResponseHeadersRead).ConfigureAwait(false);
    responseMessage.EnsureSuccessStatusCode();

    HtmlTextReader reader;
    HttpContent content = responseMessage.Content;
    if (content == null)
    {
        // No content to return: hand back an empty reader. There are no content headers to copy either.
        reader = new HtmlTextReader(String.Empty);
    }
    else
    {
        reader = await content.GetHtmlTextReaderAsync(optionsToUse.DefaultEncoding, optionsToUse.DetectEncoding).ConfigureAwait(false);

        // Store the content headers on the reader. Could be used by a parser.
        foreach (var header in content.Headers)
        {
            reader.OriginatingHttpHeaders.Add(new KeyValuePair<string, string>(header.Key, string.Join(";", header.Value)));
        }
    }

    reader.OriginatingUrl = url;
    return reader;
}
/// <summary>
/// Checks how <c>CanSeek</c> changes while reading up to, and then past, the cache size.
/// </summary>
public void Test_HtmlStream1()
{
    const int cacheSize = 10;
    MemoryStream source = new MemoryStream(
        System.Text.Encoding.ASCII.GetBytes("abcdefghijklmnopqrstuvwxyz"));
    HtmlStream stream = new HtmlStream(source, cacheSize);
    byte[] scratch = new byte[cacheSize];

    // Before anything is read the stream reports that it cannot seek.
    Assert.IsFalse(stream.CanSeek);

    // First five bytes: seeking becomes possible.
    stream.Read(scratch, 0, 5);
    Assert.IsTrue(stream.CanSeek);

    // Next five bytes: ten bytes read in total, which is exactly the cache size,
    // so seeking back to the start should still be possible.
    stream.Read(scratch, 5, 5);
    Assert.IsTrue(stream.CanSeek);

    // Ten more bytes take the total past the cache size; seeking is no longer possible.
    stream.Read(scratch, 0, 10);
    Assert.IsFalse(stream.CanSeek);
}
/// <summary>
/// Fills the cache exactly, rewinds, replays the cached bytes in two steps without touching the
/// underlying stream, and finally reads one byte past the cache, after which rewinding is unavailable.
/// </summary>
public void Test_HtmlStream2()
{
    const int cacheSize = 10;
    MemoryStream source = new MemoryStream(
        System.Text.Encoding.ASCII.GetBytes("abcdefghijklmnopqrstuvwxyz"));
    HtmlStream stream = new HtmlStream(source, cacheSize);

    // Before anything is read the stream reports that it cannot rewind.
    Assert.IsFalse(stream.CanRewind);

    // Read exactly as many bytes as the cache size.
    byte[] primer = new byte[cacheSize];
    stream.Read(primer, 0, 10);
    Assert.IsTrue(stream.CanRewind);

    // Go back to the start; rewinding must remain possible afterwards.
    stream.Rewind();
    Assert.IsTrue(stream.CanRewind);

    // Rewinding must not move the underlying stream.
    Assert.AreEqual(10, source.Position);

    // First half of the replay is expected to come from the cache...
    byte[] replay = new byte[cacheSize];
    stream.Read(replay, 0, 5);
    Assert.AreEqual("abcde", System.Text.Encoding.ASCII.GetString(replay, 0, 5));

    // ...which means the underlying stream has still not moved,
    Assert.AreEqual(10, source.Position);

    // and we can still go back to the beginning.
    Assert.IsTrue(stream.CanRewind);

    // Second half of the replay completes the cached region.
    stream.Read(replay, 5, 5);
    Assert.AreEqual("abcdefghij", System.Text.Encoding.ASCII.GetString(replay));
    Assert.IsTrue(stream.CanRewind);

    // One byte past the cache size: rewinding to the origin is no longer possible.
    stream.Read(replay, 0, 1);
    Assert.IsFalse(stream.CanRewind);
}
/// <summary>
/// Reads an fb2 document from <paramref name="stream"/>, applies the document version change and saves the
/// result, retrying once with UTF-8 when the selected encoding cannot represent the document text.
/// </summary>
/// <param name="stream">Stream holding the fb2 document. It is wrapped in an HtmlStream that is disposed after reading.</param>
/// <param name="filename">Name of the source file; used in log messages and to derive the output file name.</param>
/// <param name="lastModifiedTime">
/// Stored as the document's container timestamp when the document reports <c>ModificationType.None</c>.
/// </param>
/// <exception cref="Exception">
/// The document could not be read (the original InvalidOperationException or XmlException is available as
/// InnerException), or no character encoding was detected.
/// </exception>
private void ProcessDocument(Stream stream, string filename, DateTime lastModifiedTime)
{
    Encoding encoding = null;
    FictionBook document = null;

    ApplicationLogger.WriteStringToLog(string.Format("Processing fb2 document '{0}'.", filename));

    try
    {
        using (HtmlStream htmlStream = new HtmlStream(stream, Encoding.Default))
        {
            encoding = htmlStream.Encoding;
            document = ReadFictionBook(htmlStream);

            ChangeDocumentVersion(document);

            if (document.ModificationType == ModificationType.None)
            {
                document.ContainerDateTime = lastModifiedTime;
            }
        }
    }
    catch (InvalidOperationException exp)
    {
        // Message text is kept as-is for existing callers; the real cause is preserved as InnerException.
        throw new Exception("InvalidFictionBookFormatException(exp.Message, exp)", exp);
    }
    catch (XmlException exp)
    {
        // Message text is kept as-is for existing callers; the real cause is preserved as InnerException.
        throw new Exception("InvalidFictionBookFormatException(exp.Message, exp)", exp);
    }

    try
    {
        if (encoding == null)
        {
            throw new Exception("Can't detect a character encoding.");
        }

        // Passed to the encoder fallback below: a quarter of the document's text length.
        long threshold = (long)(document.Document.InnerText.Length * 0.25);

        if (this.preferedCodepage != null)
        {
            encoding = Encoding.GetEncoding((int)this.preferedCodepage, new EncoderCharEntityFallback(threshold), new DecoderExceptionFallback());
        }
        else if (encoding.IsSingleByte)
        {
            encoding = Encoding.GetEncoding(encoding.CodePage, new EncoderCharEntityFallback(threshold), new DecoderExceptionFallback());
        }

        // At most two save attempts: the first with the encoding selected above, the second with UTF-8.
        // encoding and document are both known to be non-null here (checked / dereferenced above).
        const int maxSaveAttempts = 2;
        for (int attempt = 1; attempt <= maxSaveAttempts; attempt++)
        {
            try
            {
                string outputFullPath = GetFilename(this.outputDirectoryGood, filename, document);
                string outputDirectory = "Temp";
                string outputFilename = Path.GetFileName(outputFullPath).Trim();

                SaveFictionBook(outputDirectory, outputFilename, document, encoding);
                break;
            }
            catch (EncoderFallbackException)
            {
                ApplicationLogger.WriteStringToError(string.Format("Invalid document encoding ({0}) detected, utf-8 is used instead.", encoding.WebName));
                encoding = Encoding.UTF8;
            }
        }
    }
    catch (IOException exp)
    {
        // An I/O failure while saving is treated as fatal for the whole process.
        ApplicationLogger.WriteStringToError(exp.Message);
        Environment.Exit(1);
    }
    catch (UnauthorizedAccessException exp)
    {
        // Access problems are only logged; processing of this document simply ends.
        ApplicationLogger.WriteStringToError(exp.Message);
    }
}
/// <summary>
/// Parses an FB2 document from <paramref name="stream"/> and copies its metadata (id, version, dates, title,
/// annotation, sequence, language, authors, translators, genres, cover flag) into a new <see cref="Book"/>.
/// </summary>
/// <param name="stream">
/// Seekable stream containing the FB2 document (its length is read and its position is reset).
/// The stream is always disposed before this method returns.
/// </param>
/// <param name="fileName">Passed to the <see cref="Book"/> constructor and used in the error log entry.</param>
/// <returns>
/// The book. If loading or parsing throws, the error is logged and the book is returned with whatever
/// had been filled in up to that point.
/// </returns>
public override Book Parse(Stream stream, string fileName)
{
    Book book = new Book(fileName);
    book.DocumentSize = (UInt32)stream.Length;
    try
    {
        // xml is not declared inside this method, so it outlives a single call. Reset it so a document
        // loaded by an earlier call can never be mistaken for this one by the null check below.
        xml = null;

        FB2File fb2 = new FB2File();

        // Load header only
        stream.Position = 0;

        // Project Mono has a bug: XDocument.Load() can't detect encoding,
        // so on Linux the encoding declared in the first line is applied by hand.
        if (Utils.IsLinux)
        {
            xml = LoadXmlUsingDeclaredEncoding(stream);
        }

        if (xml == null)
        {
            // The probe above may have consumed part of the stream.
            stream.Position = 0;
            try
            {
                xml = XDocument.Load(stream);
            }
            catch
            {
                // Try the sgml based reader for not well-formed xml files.
                xml = LoadXmlWithSgmlReader(stream);
            }
        }

        fb2.Load(xml, true);

        var documentInfo = fb2.DocumentInfo;
        if (documentInfo != null)
        {
            book.ID = documentInfo.ID;
            if (documentInfo.DocumentVersion != null)
            {
                book.Version = (float)documentInfo.DocumentVersion;
            }
            if (documentInfo.DocumentDate != null)
            {
                book.DocumentDate = documentInfo.DocumentDate.DateValue;
            }
        }

        var titleInfo = fb2.TitleInfo;
        if (titleInfo != null)
        {
            // A missing first or middle name leaves two consecutive spaces in the concatenated name.
            const string doubleSpace = "  "; // exactly two spaces
            const string singleSpace = " ";

            if (titleInfo.Cover != null && titleInfo.Cover.HasImages())
            {
                book.HasCover = true;
            }
            if (titleInfo.BookTitle != null)
            {
                book.Title = titleInfo.BookTitle.Text;
            }
            if (titleInfo.Annotation != null)
            {
                book.Annotation = titleInfo.Annotation.ToString();
            }
            if (titleInfo.Sequences != null && titleInfo.Sequences.Count > 0)
            {
                var firstSequence = titleInfo.Sequences.First();
                book.Sequence = firstSequence.Name.Capitalize(true);
                if (firstSequence.Number != null)
                {
                    book.NumberInSequence = (UInt32)(firstSequence.Number);
                }
            }
            if (titleInfo.Language != null)
            {
                book.Language = titleInfo.Language;
            }
            if (titleInfo.BookDate != null)
            {
                book.BookDate = titleInfo.BookDate.DateValue;
            }
            if (titleInfo.BookAuthors != null && titleInfo.BookAuthors.Any())
            {
                book.Authors = new List<string>();
                book.Authors.AddRange(
                    from ba in titleInfo.BookAuthors
                    select string.Concat(ba.LastName, " ", ba.FirstName, " ", ba.MiddleName).Replace(doubleSpace, singleSpace).Capitalize());
            }
            if (titleInfo.Translators != null && titleInfo.Translators.Any())
            {
                book.Translators = new List<string>();
                book.Translators.AddRange(
                    from ba in titleInfo.Translators
                    select string.Concat(ba.LastName, " ", ba.FirstName, " ", ba.MiddleName).Replace(doubleSpace, singleSpace).Capitalize());
            }
            if (titleInfo.Genres != null && titleInfo.Genres.Any())
            {
                book.Genres = new List<string>();
                book.Genres.AddRange(titleInfo.Genres.Select(g => g.Genre));
            }
        }
    }
    catch (Exception e)
    {
        // Deliberate best effort: a broken file is logged and yields a partially filled book.
        Log.WriteLine(LogLevel.Error, "Book.Parse() exception {0} on file: {1}", e.Message, fileName);
    }
    finally
    {
        if (stream != null)
        {
            stream.Dispose();
        }
    }

    return book;
}

/// <summary>
/// Reads the encoding named in the first line of the document and parses the whole stream with it.
/// Returns null when the first line carries no usable encoding attribute.
/// </summary>
private XDocument LoadXmlUsingDeclaredEncoding(Stream stream)
{
    // The readers here are intentionally not disposed: disposing a StreamReader closes the underlying
    // stream, which the caller still needs for its fallback and disposes itself when it is done.
    StreamReader probe = new StreamReader(stream);
    string firstLine = probe.ReadLine() ?? string.Empty;

    const string marker = "encoding=\"";
    int markerIndex = firstLine.IndexOf(marker, StringComparison.OrdinalIgnoreCase);
    if (markerIndex <= 0)
    {
        return null;
    }

    int nameStart = markerIndex + marker.Length;
    int nameEnd = firstLine.IndexOf('"', nameStart);
    if (nameEnd < 0)
    {
        // Unterminated attribute value: let the caller fall back to the generic loader.
        return null;
    }

    string encodingName = firstLine.Substring(nameStart, nameEnd - nameStart);

    stream.Position = 0;
    StreamReader decoded = new StreamReader(stream, Encoding.GetEncoding(encodingName));
    string xmlText = decoded.ReadToEnd();
    try
    {
        return XDocument.Parse(xmlText, LoadOptions.PreserveWhitespace);
    }
    catch
    {
        return LoadXmlWithSgmlReader(stream);
    }
}

/// <summary>
/// Rewinds the stream and loads it through the sgml based reader, for xml that is not well-formed.
/// </summary>
private XDocument LoadXmlWithSgmlReader(Stream stream)
{
    stream.Position = 0;
    using (HtmlStream reader = new HtmlStream(stream, Encoding.Default))
    {
        using (SgmlReader sgmlReader = new SgmlReader())
        {
            sgmlReader.InputStream = reader;
            sgmlReader.Dtd = LoadFb2Dtd(sgmlReader);
            return XDocument.Load(sgmlReader);
        }
    }
}
/// <summary>
/// Starts processing this entity by attaching a reader for its content.
/// </summary>
/// <param name="parent">The entity that contains this one; may be null.</param>
/// <param name="baseUri">The Uri against which this entity's own Uri is resolved; may be null.</param>
public void Open( Entity parent, Uri baseUri )
{
    Parent = parent;
    if( parent != null )
    {
        // Inherit the html flag from the containing entity.
        _isHtml = parent.IsHtml;
    }
    Line = 1;

    // Internal entities are read straight from their literal text (when there is one).
    if( IsInternal )
    {
        if( Literal != null )
        {
            _stm = new StringReader( Literal );
        }
        return;
    }

    if( Uri == null )
    {
        Error( "Unresolvable entity '{0}'", Name );
        return;
    }

    if( baseUri == null )
    {
        _resolvedUri = new Uri( Uri );
    }
    else
    {
        _resolvedUri = new Uri( baseUri, Uri );
    }

    Stream source;
    var sourceEncoding = Encoding.Default;

    if( _resolvedUri.Scheme == "file" )
    {
        // Local file: open it read-only and keep the default encoding.
        source = new FileStream( _resolvedUri.LocalPath, FileMode.Open, FileAccess.Read );
    }
    else
    {
        var response = GetWebResponse();

        // Track the Uri that actually answered when it differs from the one requested.
        var respondedFrom = response.ResponseUri;
        if( !respondedFrom.AbsoluteUri.EqualsIgnoreCase( _resolvedUri.AbsoluteUri ) )
        {
            _resolvedUri = respondedFrom;
        }

        // The encoding is derived from the lower-cased content type of the response.
        sourceEncoding = GetEncoding( response.ContentType.ToLowerInvariant() );
        source = response.GetResponseStream();
    }

    _weOwnTheStream = true;

    // Wrap the raw stream and expose the encoding reported by the wrapper.
    var htmlSource = new HtmlStream( source, sourceEncoding );
    Encoding = htmlSource.Encoding;
    _stm = htmlSource;
}