コード例 #1
0
        /// <summary>
        /// Entry point for CoNLL-U Plus files: forwards to the CoNLLDocument-based parsing routine.
        /// </summary>
        /// <param name="fileName">Path of the corpus file. Not read by this method.</param>
        /// <param name="parserFactory">Factory passed through unchanged to the parsing routine.</param>
        private void ParseDocumentCoNLLUP(string fileName, LegalTextParserFactory parserFactory)
        {
            // NOTE(review): no conversion of the file into a CoNLLDocument happens here yet,
            // so a null document is what gets forwarded. The cast pins the call to the
            // CoNLLDocument overload instead of the string (file name) overload.
            ParseDocumentXml((CoNLLDocument)null, parserFactory);
        }
コード例 #2
0
        /// <summary>
        /// Runs the two parsing passes over an already loaded document and stores the result
        /// in <c>m_marcellDocument</c>.
        /// </summary>
        /// <param name="sourceDoc">Loaded document; both it and its <c>doc</c> member must be non-null.</param>
        /// <param name="parserFactory">Factory asked for a parser matching the document language.</param>
        /// <exception cref="InvalidOperationException">
        /// Thrown when <paramref name="sourceDoc"/> or its <c>doc</c> member is null.
        /// </exception>
        private void ParseDocumentXml(CoNLLDocument sourceDoc, LegalTextParserFactory parserFactory)
        {
            bool hasBody = sourceDoc != null && sourceDoc.doc != null;
            if (!hasBody)
            {
                throw new InvalidOperationException("Unsupported document found in corpus!");
            }

            // The parser is selected by the language recorded on the document body.
            var languageParser = parserFactory.CreateParser(sourceDoc.doc.language);

            // The second pass operates on the object produced by the first pass.
            var firstPassResult = languageParser.ParsePass1(sourceDoc);
            languageParser.ParsePass2(firstPassResult);

            // Stored only after both passes have returned.
            m_marcellDocument = firstPassResult;
        }
コード例 #3
0
        /// <summary>
        /// Loads a document from an XML file, applies the pre-parse fix-ups and then parses it.
        /// </summary>
        /// <param name="fileName">Path of the XML file to deserialize.</param>
        /// <param name="parserFactory">Factory forwarded to the document-based parsing overload.</param>
        /// <exception cref="InvalidOperationException">
        /// Thrown when deserialization yields null or a document whose <c>doc</c> member is null.
        /// </exception>
        private void ParseDocumentXml(string fileName, LegalTextParserFactory parserFactory)
        {
            CoNLLDocument loadedDoc;

            // Both readers are disposed as soon as deserialization finishes.
            using (var sanitizedInput = new SanitizedStreamReader(fileName))
            using (var xml = XmlReader.Create(sanitizedInput))
            {
                loadedDoc = (CoNLLDocument)SerializerCoNLLUP.Deserialize(xml);
            }

            // Checked here because Preparse dereferences the doc member.
            if (loadedDoc == null || loadedDoc.doc == null)
            {
                throw new InvalidOperationException("Unsupported document found in corpus!");
            }

            // Repair common problems in the XML before the real parsing starts.
            Preparse(loadedDoc, fileName);

            ParseDocumentXml(loadedDoc, parserFactory);
        }
コード例 #4
0
        /// <summary>
        /// Normalizes a freshly loaded document in place: fills in a missing language and moves
        /// any sentences that sit directly under the document into an automatically generated
        /// paragraph.
        /// </summary>
        /// <param name="sourceDoc">Document to fix up; its <c>doc</c> member is dereferenced without a null check.</param>
        /// <param name="fileName">
        /// File the document came from. When the document has no language, the first two
        /// characters of the file name (without extension) are used; <c>Substring</c> throws
        /// if that name is shorter than two characters.
        /// </param>
        private void Preparse(CoNLLDocument sourceDoc, string fileName)
        {
            var body = sourceDoc.doc;

            // Fall back to the file name prefix when no language is recorded.
            if (string.IsNullOrEmpty(body.language))
            {
                var stem = Path.GetFileNameWithoutExtension(fileName);
                body.language = stem.Substring(0, 2);
            }

            var looseSentences = body.s;
            var paragraphs = body.p;

            // A generated paragraph is needed when there are no paragraphs at all (some XML
            // files lack the "p" elements) or when sentences exist outside of any paragraph.
            if (paragraphs == null || looseSentences != null)
            {
                var autoParagraph = new CoNLLUP.textDocP
                {
                    id = "autoparagraph_fix",
                    s  = looseSentences
                };

                if (paragraphs == null)
                {
                    body.p = new[] { autoParagraph };
                }
                else
                {
                    // Grow into a new array by one slot and append the generated paragraph.
                    Array.Resize(ref paragraphs, paragraphs.Length + 1);
                    paragraphs[paragraphs.Length - 1] = autoParagraph;
                    body.p = paragraphs;
                }
            }

            // The sentences, if any, now belong to a paragraph; clear the document-level list.
            body.s = null;
        }