Exemplo n.º 1
0
        public void Test_HtmlStream4()
        {
            // Arrange: 26 ASCII letters behind an HtmlStream constructed with a size of 10.
            byte[]       alphabet  = System.Text.Encoding.ASCII.GetBytes("abcdefghijklmnopqrstuvwxyz");
            MemoryStream source    = new MemoryStream(alphabet);
            int          cacheSize = 10;
            HtmlStream   wrapped   = new HtmlStream(source, cacheSize);

            // Before any read the stream reports that it cannot rewind.
            Assert.AreEqual(false, wrapped.CanRewind);

            // One 15-byte read (more than the size passed to the constructor);
            // the stream is still expected to report that it can rewind.
            byte[] firstPass = new byte[15];
            wrapped.Read(firstPass, 0, firstPass.Length);
            Assert.AreEqual(true, wrapped.CanRewind);

            // Rewinding keeps the stream rewindable and leaves the underlying
            // stream where the first read left it.
            wrapped.Rewind();
            Assert.AreEqual(true, wrapped.CanRewind);
            Assert.AreEqual(15, source.Position);

            // Reading the same 15 bytes again yields the same data.
            byte[] secondPass = new byte[15];
            wrapped.Read(secondPass, 0, secondPass.Length);
            string replayed = System.Text.Encoding.ASCII.GetString(secondPass);
            Assert.AreEqual("abcdefghijklmno", replayed);
            Assert.IsTrue(wrapped.CanRewind, "CanRewind should be true, since we should have expanded our cache");
        }
Exemplo n.º 2
0
        public void Test_HtmlStream3()
        {
            // Arrange: 26 ASCII letters behind an HtmlStream constructed with a size of 10.
            byte[]       alphabet  = System.Text.Encoding.ASCII.GetBytes("abcdefghijklmnopqrstuvwxyz");
            MemoryStream source    = new MemoryStream(alphabet);
            int          cacheSize = 10;
            HtmlStream   wrapped   = new HtmlStream(source, cacheSize);

            // Before any read the stream reports that it cannot rewind.
            Assert.AreEqual(false, wrapped.CanRewind);

            // Read exactly as many bytes as the size passed to the constructor.
            byte[] firstPass = new byte[cacheSize];
            wrapped.Read(firstPass, 0, 10);
            Assert.AreEqual(true, wrapped.CanRewind);

            // Rewinding keeps the stream rewindable and leaves the underlying
            // stream where the first read left it.
            wrapped.Rewind();
            Assert.AreEqual(true, wrapped.CanRewind);
            Assert.AreEqual(10, source.Position);

            // A 15-byte read after the rewind covers the 10 bytes already read
            // plus 5 more; afterwards rewinding is expected to be unavailable.
            byte[] secondPass = new byte[15];
            wrapped.Read(secondPass, 0, secondPass.Length);
            string replayed = System.Text.Encoding.ASCII.GetString(secondPass);
            Assert.AreEqual("abcdefghijklmno", replayed);
            Assert.IsFalse(wrapped.CanRewind, "CanRewind should be false, since we should be past the cache");
        }
Exemplo n.º 3
0
        /// <summary>
        /// Returns an HtmlTextReader that detects the underlying stream's encoding. Allows clients to stream the
        /// returned content using a TextReader. This method is similar in purpose to GetStreamAsync, however, GetStreamAsync
        /// doesn't detect the stream's encoding as GetStringAsync does.
        /// </summary>
        /// <param name="url">
        /// Absolute URL of the document. A <c>file:</c> URL is opened directly from disk; any other
        /// URL is requested through the shared <c>HttpClient</c>.
        /// </param>
        /// <param name="options">Client options; when <c>null</c>, <c>HtmlClient.Options</c> is used instead.</param>
        /// <returns>A reader over the content, with <c>OriginatingUrl</c> set to <paramref name="url"/>.</returns>
        public static async Task <HtmlTextReader> GetHtmlTextReaderAsync(string url, ClientOptions options)
        {
            HtmlTextReader reader;
            ClientOptions  optionsToUse = options == null ? HtmlClient.Options : options;
            Uri            uri          = new Uri(url);

            // See if the url pointed to a file. If so, return a reader with a file stream
            // under the hood.
            if (uri.IsFile)
            {
                // LocalPath is the unescaped operating-system path; AbsolutePath stays
                // percent-encoded and breaks on paths containing spaces or non-ASCII characters.
                FileStream fs     = File.OpenRead(uri.LocalPath);
                HtmlStream stream = new HtmlStream(fs);

                // Use optionsToUse (never null here) rather than the raw, possibly-null argument.
                reader = new HtmlTextReader(stream, optionsToUse.DefaultEncoding, EncodingConfidence.Tentative);
                reader.OriginatingUrl = url;
                return(reader);
            }

            // Set a user agent if one was specified
            if (!string.IsNullOrEmpty(optionsToUse.UserAgent))
            {
                HttpClient.DefaultRequestHeaders.Remove("User-Agent");
                HttpClient.DefaultRequestHeaders.Add("User-Agent", optionsToUse.UserAgent);
            }

            // Get the Http response (only read the headers at this point) and ensure success
            HttpResponseMessage responseMessage = await HttpClient.GetAsync(uri, HttpCompletionOption.ResponseHeadersRead).ConfigureAwait(false);

            responseMessage.EnsureSuccessStatusCode();

            HttpContent content = responseMessage.Content;

            if (content == null)
            {
                // If there is no content to return, return an empty HtmlTextReader.
                // There are no content headers to copy in this case.
                reader = new HtmlTextReader(String.Empty);
                reader.OriginatingUrl = url;
                return(reader);
            }

            reader = await content.GetHtmlTextReaderAsync(optionsToUse.DefaultEncoding, optionsToUse.DetectEncoding).ConfigureAwait(false);

            // Store some metadata on the reader. Could be used by a parser.
            reader.OriginatingUrl = url;
            foreach (var header in content.Headers)
            {
                // Multi-valued headers are flattened into a single ';'-separated string.
                reader.OriginatingHttpHeaders.Add(new KeyValuePair <string, string>(header.Key, string.Join(";", header.Value)));
            }

            return(reader);
        }
Exemplo n.º 4
0
        public void Test_HtmlStream1()
        {
            // Arrange: 26 ASCII letters behind an HtmlStream constructed with a size of 10.
            byte[]       alphabet  = System.Text.Encoding.ASCII.GetBytes("abcdefghijklmnopqrstuvwxyz");
            MemoryStream source    = new MemoryStream(alphabet);
            int          cacheSize = 10;
            HtmlStream   wrapped   = new HtmlStream(source, cacheSize);
            byte[]       scratch   = new byte[cacheSize];

            // Nothing read yet: seeking is reported as unavailable.
            Assert.AreEqual(false, wrapped.CanSeek);

            // First 5 bytes.
            wrapped.Read(scratch, 0, 5);
            Assert.AreEqual(true, wrapped.CanSeek);

            // Next 5 bytes, 10 in total (equal to the size passed to the constructor);
            // seeking is still expected to be available.
            wrapped.Read(scratch, 5, 5);
            Assert.AreEqual(true, wrapped.CanSeek);

            // 10 more bytes, 20 in total; seeking is expected to be unavailable again.
            wrapped.Read(scratch, 0, 10);
            Assert.AreEqual(false, wrapped.CanSeek);
        }
Exemplo n.º 5
0
        public void Test_HtmlStream2()
        {
            // Arrange: 26 ASCII letters behind an HtmlStream constructed with a size of 10.
            byte[]       alphabet  = System.Text.Encoding.ASCII.GetBytes("abcdefghijklmnopqrstuvwxyz");
            MemoryStream source    = new MemoryStream(alphabet);
            int          cacheSize = 10;
            HtmlStream   wrapped   = new HtmlStream(source, cacheSize);

            // Before any read the stream reports that it cannot rewind.
            Assert.AreEqual(false, wrapped.CanRewind);

            // Read exactly as many bytes as the size passed to the constructor.
            byte[] firstPass = new byte[cacheSize];
            wrapped.Read(firstPass, 0, 10);
            Assert.AreEqual(true, wrapped.CanRewind);

            // Rewinding keeps the stream rewindable and leaves the underlying
            // stream where the first read left it.
            wrapped.Rewind();
            Assert.AreEqual(true, wrapped.CanRewind);
            Assert.AreEqual(10, source.Position);

            // Re-read the first 5 bytes; the underlying stream must not advance.
            byte[] secondPass = new byte[cacheSize];
            wrapped.Read(secondPass, 0, 5);
            string firstHalf = System.Text.Encoding.ASCII.GetString(secondPass, 0, 5);
            Assert.AreEqual("abcde", firstHalf);
            Assert.AreEqual(10, source.Position);

            // Still rewindable part-way through the replay and after completing it.
            Assert.AreEqual(true, wrapped.CanRewind);
            wrapped.Read(secondPass, 5, 5);
            string replayed = System.Text.Encoding.ASCII.GetString(secondPass);
            Assert.AreEqual("abcdefghij", replayed);
            Assert.AreEqual(true, wrapped.CanRewind);

            // One byte beyond the first 10: rewinding is expected to be unavailable.
            wrapped.Read(secondPass, 0, 1);
            Assert.AreEqual(false, wrapped.CanRewind);
        }
Exemplo n.º 6
0
        /// <summary>
        /// Reads an fb2 document from <paramref name="stream"/>, updates its version information and
        /// saves it again, falling back to UTF-8 if the chosen encoding cannot represent the text.
        /// </summary>
        /// <param name="stream">Stream containing the fb2 document.</param>
        /// <param name="filename">Name used for log messages and to derive the output file name.</param>
        /// <param name="lastModifiedTime">
        /// Stored as the document's container date/time when its modification type is <c>None</c>.
        /// </param>
        /// <exception cref="Exception">
        /// Thrown when reading the document fails with an <see cref="InvalidOperationException"/> or
        /// <see cref="XmlException"/> (the original error is available as <c>InnerException</c>),
        /// or when no encoding could be determined.
        /// </exception>
        private void ProcessDocument(Stream stream, string filename, DateTime lastModifiedTime)
        {
            Encoding    encoding = null;
            FictionBook document = null;

            ApplicationLogger.WriteStringToLog(string.Format("Processing fb2 document '{0}'.", filename));

            try
            {
                using (HtmlStream htmlStream = new HtmlStream(stream, Encoding.Default))
                {
                    encoding = htmlStream.Encoding;
                    document = ReadFictionBook(htmlStream);

                    ChangeDocumentVersion(document);

                    if (document.ModificationType == ModificationType.None)
                    {
                        document.ContainerDateTime = lastModifiedTime;
                    }
                }
            }
            catch (InvalidOperationException exp)
            {
                // Keep the original failure as InnerException so the root cause is not lost.
                throw new Exception("InvalidFictionBookFormatException(exp.Message, exp)", exp);
            }
            catch (XmlException exp)
            {
                // Keep the original failure as InnerException so the root cause is not lost.
                throw new Exception("InvalidFictionBookFormatException(exp.Message, exp)", exp);
            }

            try
            {
                if (encoding == null)
                {
                    throw new Exception("Can't detect a character encoding.");
                }

                // A quarter of the document's text length, handed to the encoder fallback.
                long threshold = (long)(document.Document.InnerText.Length * 0.25);

                // Rebuild the encoding with the entity fallback: a configured codepage wins,
                // otherwise only single-byte encodings are replaced.
                if (this.preferedCodepage != null)
                {
                    encoding = Encoding.GetEncoding((int)this.preferedCodepage, new EncoderCharEntityFallback(threshold), new DecoderExceptionFallback());
                }
                else if (encoding.IsSingleByte)
                {
                    encoding = Encoding.GetEncoding(encoding.CodePage, new EncoderCharEntityFallback(threshold), new DecoderExceptionFallback());
                }

                bool done       = false;
                int  retryCount = 0;

                // At most two save attempts: the first with the selected encoding and,
                // if that raises EncoderFallbackException, a second one with UTF-8.
                do
                {
                    try
                    {
                        if (++retryCount > 2)
                        {
                            break;
                        }

                        if (encoding != null && document != null)
                        {
                            // Only the file-name part of the computed path is used; the
                            // document is written to the "Temp" directory.
                            string outputFullPath  = GetFilename(this.outputDirectoryGood, filename, document);
                            string outputDirectory = "Temp";
                            string outputFilename  = Path.GetFileName(outputFullPath).Trim();

                            SaveFictionBook(outputDirectory, outputFilename, document, encoding);
                        }

                        done = true;
                    }
                    catch (EncoderFallbackException)
                    {
                        if (encoding != null)
                        {
                            ApplicationLogger.WriteStringToError(string.Format("Invalid document encoding ({0}) detected, utf-8 is used instead.", encoding.WebName));
                        }

                        encoding = Encoding.UTF8;
                    }
                }while (!done);
            }
            catch (IOException exp)
            {
                // I/O failures are logged and terminate the process.
                ApplicationLogger.WriteStringToError(exp.Message);
                Environment.Exit(1);
            }
            catch (UnauthorizedAccessException exp)
            {
                // Access failures are logged only; processing of this document stops here.
                ApplicationLogger.WriteStringToError(exp.Message);
            }
        }
Exemplo n.º 7
0
        /// <summary>
        /// Parses an FB2 document from <paramref name="stream"/> and copies its metadata
        /// (id, version, dates, title, annotation, sequence, language, authors, translators,
        /// genres, cover flag) into a new <see cref="Book"/>.
        /// </summary>
        /// <param name="stream">
        /// Seekable stream containing the document. It is always disposed before this method returns.
        /// </param>
        /// <param name="fileName">File name used to create the book and in the error log message.</param>
        /// <returns>
        /// The book. If parsing fails the error is logged and the book is returned with only
        /// the fields that were set before the failure.
        /// </returns>
        public override Book Parse(Stream stream, string fileName)
        {
            Book book = new Book(fileName);

            book.DocumentSize = (UInt32)stream.Length;

            try
            {
                FB2File fb2 = new FB2File();
                // Load header only
                stream.Position = 0;

                // Project Mono has a bug: Xdocument.Load() can't detect encoding
                if (Utils.IsLinux)
                {
                    // The readers in this branch are intentionally not disposed: disposing a
                    // StreamReader closes the underlying stream, which the generic loading path
                    // below still needs when no encoding declaration is found. The stream itself
                    // is disposed in the finally block.
                    StreamReader sr          = new StreamReader(stream);
                    string       declaration = sr.ReadLine() ?? string.Empty;
                    int          idx         = declaration.IndexOf("encoding=\"", StringComparison.OrdinalIgnoreCase);
                    if (idx > 0)
                    {
                        // Extract the value between the quotes of encoding="...".
                        string encoding = declaration.Substring(idx + 10);
                        encoding        = encoding.Substring(0, encoding.IndexOf('"'));
                        stream.Position = 0;

                        StreamReader esr    = new StreamReader(stream, Encoding.GetEncoding(encoding));
                        string       xmlStr = esr.ReadToEnd();
                        try
                        {
                            xml = XDocument.Parse(xmlStr, LoadOptions.PreserveWhitespace);
                        }
                        catch
                        {
                            xml = LoadXmlViaSgmlReader(stream);
                        }
                    }
                }

                // NOTE(review): xml is declared outside this method (not visible here) — confirm
                // it is null at the start of every call, otherwise a previous document is reused.
                if (xml == null)
                {
                    try
                    {
                        // The Linux probe above may have consumed part of the stream.
                        stream.Position = 0;
                        xml = XDocument.Load(stream);
                    }
                    catch
                    {
                        // This code will try to use the sgml based reader for not well-formed xml files
                        xml = LoadXmlViaSgmlReader(stream);
                    }
                }

                fb2.Load(xml, true);

                if (fb2.DocumentInfo != null)
                {
                    book.ID = fb2.DocumentInfo.ID;
                    if (fb2.DocumentInfo.DocumentVersion != null)
                    {
                        book.Version = (float)fb2.DocumentInfo.DocumentVersion;
                    }
                    if (fb2.DocumentInfo.DocumentDate != null)
                    {
                        book.DocumentDate = fb2.DocumentInfo.DocumentDate.DateValue;
                    }
                }

                if (fb2.TitleInfo != null)
                {
                    if (fb2.TitleInfo.Cover != null && fb2.TitleInfo.Cover.HasImages())
                    {
                        book.HasCover = true;
                    }
                    if (fb2.TitleInfo.BookTitle != null)
                    {
                        book.Title = fb2.TitleInfo.BookTitle.Text;
                    }
                    if (fb2.TitleInfo.Annotation != null)
                    {
                        book.Annotation = fb2.TitleInfo.Annotation.ToString();
                    }
                    if (fb2.TitleInfo.Sequences != null && fb2.TitleInfo.Sequences.Count > 0)
                    {
                        // Only the first sequence is recorded.
                        book.Sequence = fb2.TitleInfo.Sequences.First().Name.Capitalize(true);
                        if (fb2.TitleInfo.Sequences.First().Number != null)
                        {
                            book.NumberInSequence = (UInt32)(fb2.TitleInfo.Sequences.First().Number);
                        }
                    }
                    if (fb2.TitleInfo.Language != null)
                    {
                        book.Language = fb2.TitleInfo.Language;
                    }
                    if (fb2.TitleInfo.BookDate != null)
                    {
                        book.BookDate = fb2.TitleInfo.BookDate.DateValue;
                    }
                    if (fb2.TitleInfo.BookAuthors != null && fb2.TitleInfo.BookAuthors.Any())
                    {
                        // Names are stored as "Last First Middle" with doubled spaces collapsed.
                        book.Authors = new List <string>();
                        book.Authors.AddRange(from ba in fb2.TitleInfo.BookAuthors select string.Concat(ba.LastName, " ", ba.FirstName, " ", ba.MiddleName).Replace("  ", " ").Capitalize());
                    }
                    if (fb2.TitleInfo.Translators != null && fb2.TitleInfo.Translators.Any())
                    {
                        book.Translators = new List <string>();
                        book.Translators.AddRange(from ba in fb2.TitleInfo.Translators select string.Concat(ba.LastName, " ", ba.FirstName, " ", ba.MiddleName).Replace("  ", " ").Capitalize());
                    }
                    if (fb2.TitleInfo.Genres != null && fb2.TitleInfo.Genres.Any())
                    {
                        book.Genres = new List <string>();
                        book.Genres.AddRange((from g in fb2.TitleInfo.Genres select g.Genre).ToList());
                    }
                }
            }
            catch (Exception e)
            {
                // Parsing is best-effort: log and return whatever was populated so far.
                Log.WriteLine(LogLevel.Error, "Book.Parse() exception {0} on file: {1}", e.Message, fileName);
            }
            finally
            {
                if (stream != null)
                {
                    stream.Dispose();
                    stream = null;
                }
            }

            return(book);
        }

        /// <summary>
        /// Rewinds <paramref name="stream"/> and loads it through the SGML reader with the FB2 DTD;
        /// used as a fallback for documents that are not well-formed XML.
        /// </summary>
        private XDocument LoadXmlViaSgmlReader(Stream stream)
        {
            stream.Position = 0;

            using (HtmlStream reader = new HtmlStream(stream, Encoding.Default))
            {
                using (SgmlReader sgmlReader = new SgmlReader())
                {
                    sgmlReader.InputStream = reader;
                    sgmlReader.Dtd         = LoadFb2Dtd(sgmlReader);

                    return XDocument.Load(sgmlReader);
                }
            }
        }
Exemplo n.º 8
0
        /// <summary>
        /// Starts processing this entity: records its parent, resets the line counter and
        /// opens the reader the entity's content will be read from.
        /// </summary>
        /// <param name="parent">The parent of this entity; may be null.</param>
        /// <param name="baseUri">Base Uri the entity's own Uri is resolved against; may be null.</param>
        public void Open( Entity parent, Uri baseUri )
        {
            Parent = parent;
            if( parent != null )
                _isHtml = parent.IsHtml;

            Line = 1;

            // Internal entities are read straight from their literal text, if there is one.
            if( IsInternal ) {
                if( Literal != null )
                    _stm = new StringReader( Literal );

                return;
            }

            // External entities need a Uri to load from.
            if( Uri == null ) {
                Error( "Unresolvable entity '{0}'", Name );
                return;
            }

            if( baseUri == null )
                _resolvedUri = new Uri( Uri );
            else
                _resolvedUri = new Uri( baseUri, Uri );

            Stream source;
            var encoding = Encoding.Default;

            if( _resolvedUri.Scheme == "file" ) {
                // Local file: opened read-only, default encoding as the starting point.
                var localPath = _resolvedUri.LocalPath;
                source = new FileStream( localPath, FileMode.Open, FileAccess.Read );
            }
            else {
                // Any other scheme goes through a web request.
                var webResponse = GetWebResponse();

                // Adopt the Uri the response actually came from if it differs.
                var respondedFrom = webResponse.ResponseUri;
                if( !respondedFrom.AbsoluteUri.EqualsIgnoreCase( _resolvedUri.AbsoluteUri ) ) {
                    _resolvedUri = respondedFrom;
                }

                // The starting encoding is derived from the lower-cased content type.
                var loweredContentType = webResponse.ContentType.ToLowerInvariant();
                encoding = GetEncoding( loweredContentType );
                source = webResponse.GetResponseStream();
            }

            _weOwnTheStream = true;

            // Wrap the raw stream and publish the encoding the wrapper reports.
            var wrapped = new HtmlStream( source, encoding );
            Encoding = wrapped.Encoding;
            _stm = wrapped;
        }