コード例 #1
0
        // Parses a small XML document with the XML parser and checks that the leading XML declaration
        // exposes each of its pseudo-attributes, and that it serializes back with double-quoted values.
        public virtual void TestParseDeclarationAttributes()
        {
            String source = "<?xml version='1' encoding='UTF-8' something='else'?><val>One</val>";
            iText.StyledXmlParser.Jsoup.Parser.Parser xmlParser = iText.StyledXmlParser.Jsoup.Parser.Parser.XmlParser();
            Document parsed = iText.StyledXmlParser.Jsoup.Jsoup.Parse(source, "", xmlParser);

            // The declaration is expected to be the first child node of the parsed document.
            XmlDeclaration declaration = (XmlDeclaration)parsed.ChildNode(0);

            // Individual attributes are readable by name.
            NUnit.Framework.Assert.AreEqual("1", declaration.Attr("version"));
            NUnit.Framework.Assert.AreEqual("UTF-8", declaration.Attr("encoding"));
            NUnit.Framework.Assert.AreEqual("else", declaration.Attr("something"));

            // Both serialized forms use double quotes, although the input used single quotes.
            String expectedWhole = "version=\"1\" encoding=\"UTF-8\" something=\"else\"";
            NUnit.Framework.Assert.AreEqual(expectedWhole, declaration.GetWholeDeclaration());

            String expectedOuter = "<?xml version=\"1\" encoding=\"UTF-8\" something=\"else\"?>";
            NUnit.Framework.Assert.AreEqual(expectedOuter, declaration.OuterHtml());
        }
コード例 #2
0
ファイル: DataUtil.cs プロジェクト: zymemail/itext7-dotnet
        // Decodes the given bytes to text and parses that text into a Document.
        //
        // The bytes are kept in a buffer and decoded afterwards so that the charset can be switched
        // midstream: when no charset is known up front, the data is first decoded and parsed with the
        // default charset, the parsed document is searched for a charset declaration (a meta tag or an
        // XML declaration), and, if a different charset is declared, the buffer is rewound, decoded
        // again with that charset and re-parsed.
        //
        // byteData    - the raw bytes of the document; rewound and decoded a second time on re-decode.
        // charsetName - charset to decode with, or null to detect it from the document itself.
        //               A charset reported by DetectCharsetFromBom replaces this value.
        // baseUri     - passed unchanged to parser.ParseInput.
        // parser      - parser used for the detection pass and for the final parse.
        //
        // Returns the parsed Document. When the final parse happens at the end of this method (explicit,
        // BOM-detected, or re-decoded charset) the document's output settings charset is set to the
        // charset that was used; when the first default-charset parse is kept, output settings are left
        // untouched.
        //
        // todo - this is getting gnarly. needs a rewrite.
        internal static Document ParseByteData(ByteBuffer byteData, String charsetName, String baseUri, iText.StyledXmlParser.Jsoup.Parser.Parser
                                               parser)
        {
            String   docData;
            Document doc = null;

            // look for BOM - overrides any other header or input
            charsetName = DetectCharsetFromBom(byteData, charsetName);
            if (charsetName == null)
            {
                // determine from meta. safe first parse as UTF-8
                // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
                docData = EncodingUtil.GetEncoding(defaultCharset).Decode(byteData).ToString();
                doc     = parser.ParseInput(docData, baseUri);
                iText.StyledXmlParser.Jsoup.Nodes.Element meta = doc.Select("meta[http-equiv=content-type], meta[charset]"
                                                                            ).First();
                String foundCharset = null;
                // if not found, will keep utf-8 as best attempt
                if (meta != null)
                {
                    if (meta.HasAttr("http-equiv"))
                    {
                        foundCharset = GetCharsetFromContentType(meta.Attr("content"));
                    }
                    // the HTML5 charset attribute is only a fallback for a missing/unusable content type
                    if (foundCharset == null && meta.HasAttr("charset"))
                    {
                        foundCharset = meta.Attr("charset");
                    }
                }
                // look for <?xml encoding='ISO-8859-1'?>
                // Guard on the child count first: a document without any child nodes (e.g. empty input
                // handled by an XML parser) has no node at index 0, and asking for it would throw instead
                // of falling back to the default charset.
                if (foundCharset == null && doc.ChildNodeSize() > 0 && doc.ChildNode(0) is XmlDeclaration)
                {
                    XmlDeclaration prolog = (XmlDeclaration)doc.ChildNode(0);
                    if (prolog.Name().Equals("xml"))
                    {
                        foundCharset = prolog.Attr("encoding");
                    }
                }
                foundCharset = ValidateCharset(foundCharset);
                if (foundCharset != null && !foundCharset.Equals(defaultCharset))
                {
                    // need to re-decode
                    // strip surrounding whitespace and any quote characters from the declared name
                    foundCharset = iText.IO.Util.StringUtil.ReplaceAll(foundCharset.Trim(), "[\"']", "");
                    charsetName  = foundCharset;
                    byteData.Rewind();
                    docData = EncodingUtil.GetEncoding(foundCharset).Decode(byteData).ToString();
                    // discard the first-pass document so the re-decoded text is parsed below
                    doc     = null;
                }
            }
            else
            {
                // specified by content type header (or by user on file load)
                Validate.NotEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML"
                                  );
                docData = EncodingUtil.GetEncoding(charsetName).Decode(byteData).ToString();
            }
            // doc is still null when the charset was known up front or when a re-decode was required
            if (doc == null)
            {
                doc = parser.ParseInput(docData, baseUri);
                doc.OutputSettings().Charset(charsetName);
            }
            return(doc);
        }