Beispiel #1
0
        /// <summary>
        /// Restores this object's state from XML.
        /// </summary>
        /// <remarks>
        /// In <c>SpacyAndDotNet</c> mode the wrapping element is entered, a Base64
        /// <c>PyObj</c> payload is decoded and loaded into a freshly created spaCy
        /// vocab, and the wrapping element is closed. In every other mode the
        /// element is skipped as a whole.
        /// </remarks>
        /// <param name="reader">Reader positioned on the element to restore from.</param>
        public void ReadXml(XmlReader reader)
        {
            var mode = Serialization.Selected;

            if (mode != Serialization.Mode.SpacyAndDotNet)
            {
                // No Python payload to restore: step over the element and its content.
                reader.Skip();
            }
            else
            {
                // Step into the wrapping element; its first child must be the payload.
                reader.ReadStartElement();
                Debug.Assert(reader.Name == $"{Serialization.Prefix}:PyObj");

                var payload = Convert.FromBase64String(reader.ReadElementContentAsString());

                using (Py.GIL())
                {
                    dynamic spacyModule = Py.Import("spacy");
                    PyVocab = spacyModule.vocab.Vocab.__call__();

                    var pyPayload = ToPython.GetBytes(payload);
                    PyVocab.from_bytes(pyPayload);
                }

                // Leave the wrapping element.
                reader.ReadEndElement();
            }

            // Pure-spaCy mode is not expected to go through the XML path.
            Debug.Assert(mode != Serialization.Mode.Spacy);
        }
Beispiel #2
0
        /// <summary>
        /// Restores the document collection from XML.
        /// </summary>
        /// <remarks>
        /// Expected layout: a <c>DocBin</c> element that contains, in
        /// <c>SpacyAndDotNet</c> mode, a Base64 <c>PyObj</c> payload, followed by a
        /// <c>Docs</c> element holding zero or more documents. On return the reader
        /// is positioned after the closing <c>DocBin</c> tag when that tag directly
        /// follows the <c>Docs</c> element.
        /// </remarks>
        /// <param name="reader">Reader positioned on, or just before, the <c>DocBin</c> element.</param>
        public void ReadXml(XmlReader reader)
        {
            var serializationMode = Serialization.Selected;

            reader.MoveToContent();

            Debug.Assert(reader.Name == $"{Serialization.Prefix}:DocBin");
            reader.ReadStartElement();

            if (serializationMode == Serialization.Mode.SpacyAndDotNet)
            {
                Debug.Assert(reader.Name == $"{Serialization.Prefix}:PyObj");
                var bytesB64 = reader.ReadElementContentAsString();
                var bytes    = Convert.FromBase64String(bytesB64);

                using (Py.GIL())
                {
                    dynamic spacy = Py.Import("spacy");
                    _pyDocBin = spacy.tokens.DocBin.__call__();

                    var pyBytes = ToPython.GetBytes(bytes);
                    _pyDocBin.from_bytes(pyBytes);
                }
            }

            // Pure-spaCy mode is not expected to go through the XML path.
            Debug.Assert(serializationMode != Serialization.Mode.Spacy);

            Debug.Assert(reader.Name == $"{Serialization.Prefix}:Docs");

            // A self-closing <Docs/> has no matching end tag; remember that before
            // stepping past it so the wrong element is not closed below.
            var isEmpty = reader.IsEmptyElement;
            reader.ReadStartElement();
            _docs = new List <Doc>();

            if (!isEmpty)
            {
                var docElementName = $"{Serialization.Prefix}:Doc";

                while (reader.MoveToContent() != XmlNodeType.EndElement)
                {
                    var doc = new Doc();
                    doc.ReadXml(reader);
                    _docs.Add(doc);

                    // The document reader may return while still positioned on its
                    // own closing tag. Consume it here; otherwise the loop would
                    // mistake it for the end of the list and drop the remaining
                    // documents.
                    if (reader.MoveToContent() == XmlNodeType.EndElement && reader.Name == docElementName)
                    {
                        reader.ReadEndElement();
                    }
                }

                // Closes the Docs element.
                reader.ReadEndElement();
            }

            // Consume the wrapper's own closing tag so the whole element has been
            // read regardless of whether the list was empty.
            if (reader.MoveToContent() == XmlNodeType.EndElement
                && reader.Name == $"{Serialization.Prefix}:DocBin")
            {
                reader.ReadEndElement();
            }
        }
Beispiel #3
0
        /// <summary>
        /// Restores this object's fields from a flat sequence of XML elements.
        /// </summary>
        /// <remarks>
        /// Elements are read in a fixed order: <c>PyObj</c> (Base64 payload),
        /// <c>Text</c>, <c>Shape</c>, <c>Prefix</c>, <c>Suffix</c>, <c>Lang</c>,
        /// <c>Orth</c>, <c>IsAlpha</c>, <c>IsDigit</c>, <c>IsTitle</c>.
        /// </remarks>
        /// <param name="reader">Reader positioned on the <c>PyObj</c> element.</param>
        public void ReadXml(XmlReader reader)
        {
            // TODO: Yet to debug. It's not being used so far
            Debug.Assert(reader.Name == $"{Serialization.Prefix}:PyObj");
            var bytesB64 = reader.ReadElementContentAsString();
            var bytes    = Convert.FromBase64String(bytesB64);

            using (Py.GIL())
            {
                // NOTE(review): _pyLexeme is not created in this method; presumably it
                // is initialised elsewhere before ReadXml is called - confirm.
                var pyBytes = ToPython.GetBytes(bytes);
                _pyLexeme.from_bytes(pyBytes);
            }

            Debug.Assert(reader.Name == $"{Serialization.Prefix}:Text");
            _text = reader.ReadElementContentAsString();
            Debug.Assert(reader.Name == $"{Serialization.Prefix}:Shape");
            _shape = reader.ReadElementContentAsString();
            Debug.Assert(reader.Name == $"{Serialization.Prefix}:Prefix");
            _prefix = reader.ReadElementContentAsString();
            Debug.Assert(reader.Name == $"{Serialization.Prefix}:Suffix");
            _suffix = reader.ReadElementContentAsString();
            Debug.Assert(reader.Name == $"{Serialization.Prefix}:Lang");
            _lang = reader.ReadElementContentAsString();

            Debug.Assert(reader.Name == $"{Serialization.Prefix}:Orth");
            var orth = reader.ReadElementContentAsString();

            // Serialized data is machine-readable, so parse it independently of the
            // current thread culture.
            _orth = BigInteger.Parse(orth, System.Globalization.CultureInfo.InvariantCulture);

            Debug.Assert(reader.Name == $"{Serialization.Prefix}:IsAlpha");
            _isAlpha = reader.ReadElementContentAsBoolean();
            Debug.Assert(reader.Name == $"{Serialization.Prefix}:IsDigit");
            _isDigit = reader.ReadElementContentAsBoolean();
            Debug.Assert(reader.Name == $"{Serialization.Prefix}:IsTitle");
            _isTitle = reader.ReadElementContentAsBoolean();
        }
Beispiel #4
0
        /// <summary>
        /// Restores this document from a byte array.
        /// </summary>
        /// <remarks>
        /// In <c>Spacy</c> mode the bytes are handed to the Python object's
        /// <c>from_bytes</c>. In every other mode the bytes are read as XML into a
        /// temporary document which is then copied into this instance.
        /// </remarks>
        /// <param name="bytes">Serialized document data.</param>
        public void FromBytes(byte[] bytes)
        {
            if (Serialization.Selected == Serialization.Mode.Spacy)
            {
                using (Py.GIL())
                {
                    var pyBytes = ToPython.GetBytes(bytes);
                    PyDoc.from_bytes(pyBytes);
                }

                return;
            }

            var settings = new XmlReaderSettings
            {
                IgnoreComments   = true,
                IgnoreWhitespace = true
            };

            // Both the stream and the reader are disposable; release them as soon as
            // the document has been read.
            using (var stream = new MemoryStream(bytes))
            using (var reader = XmlReader.Create(stream, settings))
            {
                var doc = new Doc();
                doc.ReadXml(reader);
                Copy(doc);
            }
        }
Beispiel #5
0
        /// <summary>
        /// Restores this document from XML.
        /// </summary>
        /// <remarks>
        /// Expected layout: a <c>Doc</c> element containing, in order, an optional
        /// Base64 <c>PyObj</c> payload (only in <c>SpacyAndDotNet</c> mode),
        /// <c>Text</c>, <c>Vocab</c>, <c>Tokens</c>, <c>Sentences</c>,
        /// <c>NounChunks</c> and <c>Ents</c>. Every list may be empty or
        /// self-closing. On return the closing <c>Doc</c> tag has been consumed when
        /// it directly follows <c>Ents</c>, so the reader position no longer depends
        /// on whether the last list was empty.
        /// </remarks>
        /// <param name="reader">Reader positioned on, or just before, the <c>Doc</c> element.</param>
        public void ReadXml(XmlReader reader)
        {
            var serializationMode = Serialization.Selected;

            reader.MoveToContent();

            Debug.Assert(reader.Name == $"{Serialization.Prefix}:Doc");
            reader.ReadStartElement();

            if (serializationMode == Serialization.Mode.SpacyAndDotNet)
            {
                Debug.Assert(reader.Name == $"{Serialization.Prefix}:PyObj");
                var bytesB64 = reader.ReadElementContentAsString();
                var bytes    = Convert.FromBase64String(bytesB64);
                using (Py.GIL())
                {
                    dynamic spacy   = Py.Import("spacy");
                    dynamic pyVocab = spacy.vocab.Vocab.__call__();
                    PyDoc = spacy.tokens.doc.Doc.__call__(pyVocab);

                    var pyBytes = ToPython.GetBytes(bytes);
                    PyDoc.from_bytes(pyBytes);

                    // NOTE(review): this value is replaced again when the Vocab
                    // element is read further down.
                    _vocab = new Vocab(PyDoc.vocab);
                }
            }

            // Pure-spaCy mode is not expected to go through the XML path.
            Debug.Assert(Serialization.Selected != Serialization.Mode.Spacy);

            Debug.Assert(reader.Name == $"{Serialization.Prefix}:Text");
            _text = reader.ReadElementContentAsString();

            Debug.Assert(reader.Name == $"{Serialization.Prefix}:Vocab");
            _vocab = new Vocab(null);
            _vocab.ReadXml(reader);

            _tokens = new List <Token>();
            ReadWrappedItems(reader, "Tokens", "Token", itemReader =>
            {
                var item = new Token();
                item.ReadXml(itemReader);
                _tokens.Add(item);
            });

            // Heads can only be restored once every token has been read.
            foreach (var token in _tokens)
            {
                token.RestoreHead(_tokens);
            }

            _sentences = new List <Span>();
            ReadWrappedItems(reader, "Sentences", "Sent", itemReader =>
            {
                var item = new Span();
                item.ReadXml(itemReader);
                _sentences.Add(item);
            });

            _nounChunks = new List <Span>();
            ReadWrappedItems(reader, "NounChunks", "NounChunk", itemReader =>
            {
                var item = new Span();
                item.ReadXml(itemReader);
                _nounChunks.Add(item);
            });

            _ents = new List <Span>();
            ReadWrappedItems(reader, "Ents", "Ent", itemReader =>
            {
                var item = new Span();
                item.ReadXml(itemReader);
                _ents.Add(item);
            });

            // Consume the document's own closing tag so callers that read several
            // documents in a row see the next sibling rather than a stray end tag.
            if (reader.MoveToContent() == XmlNodeType.EndElement
                && reader.Name == $"{Serialization.Prefix}:Doc")
            {
                reader.ReadEndElement();
            }
        }

        /// <summary>
        /// Reads a list element whose children are individually wrapped items and
        /// invokes <paramref name="readItem"/> on the content of each non-empty item.
        /// </summary>
        /// <param name="reader">Reader positioned on the list element.</param>
        /// <param name="containerName">Local name of the list element (without prefix).</param>
        /// <param name="itemName">Local name of each item wrapper (without prefix).</param>
        /// <param name="readItem">Callback that reads one item's content and stores it.</param>
        private static void ReadWrappedItems(XmlReader reader, string containerName, string itemName, Action<XmlReader> readItem)
        {
            Debug.Assert(reader.Name == $"{Serialization.Prefix}:{containerName}");

            // A self-closing list has no end tag to consume afterwards.
            var listIsEmpty = reader.IsEmptyElement;
            reader.ReadStartElement();

            if (listIsEmpty)
            {
                return;
            }

            while (reader.MoveToContent() != XmlNodeType.EndElement)
            {
                Debug.Assert(reader.Name == $"{Serialization.Prefix}:{itemName}");

                var itemIsEmpty = reader.IsEmptyElement;
                reader.ReadStartElement();

                if (itemIsEmpty)
                {
                    // Self-closing item wrapper: nothing to read, nothing to close.
                    continue;
                }

                if (reader.NodeType != XmlNodeType.EndElement)
                {
                    readItem(reader);
                }

                // Always close the item wrapper so an empty item cannot be mistaken
                // for the end of the list.
                reader.ReadEndElement();
            }

            // Closes the list element itself.
            reader.ReadEndElement();
        }