/// <summary>
/// Parses a corpus document supplied as a CoNLLUP file by handing it to the XML parsing path.
/// </summary>
/// <remarks>
/// The conversion step is not implemented in this method: <paramref name="fileName"/> is never
/// read and the document passed on is always null, so the null guard in
/// <c>ParseDocumentXml</c> throws <see cref="InvalidOperationException"/> on every call.
/// </remarks>
/// <param name="fileName">Path of the CoNLLUP file (currently unused).</param>
/// <param name="parserFactory">Factory forwarded unchanged to <c>ParseDocumentXml</c>.</param>
private void ParseDocumentCoNLLUP(string fileName, LegalTextParserFactory parserFactory)
{
    //Placeholder for the XML representation of the file; nothing populates it yet
    CoNLLDocument convertedDoc = null;

    //From here on the document is handled exactly like one loaded from XML
    ParseDocumentXml(convertedDoc, parserFactory);
}
/// <summary>
/// Runs both parser passes over an already loaded document and stores the first-pass result
/// in <c>m_marcellDocument</c>.
/// </summary>
/// <param name="sourceDoc">Loaded document; it and its <c>doc</c> member must be non-null.</param>
/// <param name="parserFactory">Factory asked for a parser for <c>sourceDoc.doc.language</c>.</param>
/// <exception cref="InvalidOperationException">
/// <paramref name="sourceDoc"/> or its <c>doc</c> member is null.
/// </exception>
private void ParseDocumentXml(CoNLLDocument sourceDoc, LegalTextParserFactory parserFactory)
{
    bool hasContent = sourceDoc != null && sourceDoc.doc != null;
    if (!hasContent)
    {
        throw new InvalidOperationException("Unsupported document found in corpus!");
    }

    //The parser is selected by the language recorded in the document
    var languageParser = parserFactory.CreateParser(sourceDoc.doc.language);

    var firstPassResult = languageParser.ParsePass1(sourceDoc);

    //The second pass is given the first-pass result; nothing it returns is used here
    languageParser.ParsePass2(firstPassResult);

    //Assigned last, so the field is only updated when neither pass threw
    m_marcellDocument = firstPassResult;
}
/// <summary>
/// Loads a document from an XML file, repairs it with <c>Preparse</c> and then parses it
/// through the <c>ParseDocumentXml(CoNLLDocument, LegalTextParserFactory)</c> overload.
/// </summary>
/// <param name="fileName">Path of the XML file; also passed to <c>Preparse</c>.</param>
/// <param name="parserFactory">Factory forwarded to the document overload.</param>
/// <exception cref="InvalidOperationException">
/// Deserialization produced null, or a document whose <c>doc</c> member is null.
/// </exception>
private void ParseDocumentXml(string fileName, LegalTextParserFactory parserFactory)
{
    CoNLLDocument loadedDoc;

    //Both readers are released before the document is inspected
    using (var sanitizedInput = new SanitizedStreamReader(fileName))
    using (var reader = XmlReader.Create(sanitizedInput))
    {
        loadedDoc = (CoNLLDocument)SerializerCoNLLUP.Deserialize(reader);
    }

    //Preparse dereferences loadedDoc.doc, so reject unusable documents before calling it
    if (loadedDoc == null || loadedDoc.doc == null)
    {
        throw new InvalidOperationException("Unsupported document found in corpus!");
    }

    //Fix common problems in xml files before the actual parsing
    Preparse(loadedDoc, fileName);

    ParseDocumentXml(loadedDoc, parserFactory);
}
/// <summary>
/// Repairs common problems in a loaded document before it is parsed: a missing language and
/// sentences that sit directly under the document instead of inside a paragraph.
/// </summary>
/// <remarks>
/// <para>
/// When <c>doc.language</c> is null or empty it is set to the first two characters of the file
/// name without extension. Shorter names are used as they are (a null path yields an empty
/// string) instead of throwing from <c>Substring</c>.
/// </para>
/// <para>
/// When <c>doc.p</c> is null, or <c>doc.s</c> is non-null, a paragraph with id
/// <c>autoparagraph_fix</c> holding <c>doc.s</c> is appended after any existing paragraphs.
/// If both are null, that paragraph's sentence list is null. <c>doc.s</c> is always reset to
/// null afterwards.
/// </para>
/// </remarks>
/// <param name="sourceDoc">Document to repair in place; its <c>doc</c> member must be non-null.</param>
/// <param name="fileName">Path of the file the document was loaded from.</param>
private void Preparse(CoNLLDocument sourceDoc, string fileName)
{
    const int languagePrefixLength = 2;

    //Ensure we have a language
    if (string.IsNullOrEmpty(sourceDoc.doc.language))
    {
        //GetFileNameWithoutExtension returns null for a null path and Substring(0, 2) throws
        //for names shorter than two characters, so clamp rather than slice blindly
        string baseName = Path.GetFileNameWithoutExtension(fileName) ?? string.Empty;
        sourceDoc.doc.language = baseName.Length > languagePrefixLength
            ? baseName.Substring(0, languagePrefixLength)
            : baseName;
    }

    //Ensure we have paragraphs (some Xml files are missing the "p" elements, others have
    //sentences outside of them) by moving the sentences found directly in the doc to a new
    //automatically generated paragraph placed after the existing ones
    var looseSentences = sourceDoc.doc.s;
    var existingParagraphs = sourceDoc.doc.p;
    if (existingParagraphs == null || looseSentences != null)
    {
        int existingCount = existingParagraphs == null ? 0 : existingParagraphs.Length;

        var paragraphs = new CoNLLUP.textDocP[existingCount + 1];
        if (existingCount > 0)
        {
            Array.Copy(existingParagraphs, paragraphs, existingCount);
        }

        paragraphs[existingCount] = new CoNLLUP.textDocP
        {
            id = "autoparagraph_fix",
            s = looseSentences
        };
        sourceDoc.doc.p = paragraphs;
    }

    //The sentences now live in a paragraph (or there were none), so clear the doc-level list
    sourceDoc.doc.s = null;
}