/// <summary>
        /// Entry point for documents supplied in CoNLL-U Plus form: the file is meant to be
        /// converted to a <c>CoNLLDocument</c> and then parsed like an XML document.
        /// </summary>
        /// <param name="fileName">Path of the input file. Not read by the current body.</param>
        /// <param name="parserFactory">Factory handed on to the document parsing overload.</param>
        /// <remarks>
        /// NOTE(review): the conversion step is not implemented. The document passed on is
        /// always null, so the <c>ParseDocumentXml(CoNLLDocument, ...)</c> overload rejects it
        /// with an <c>InvalidOperationException</c> on every call.
        /// </remarks>
        private void ParseDocumentCoNLLUP(string fileName, LegalTextParserFactory parserFactory)
        {
            // Placeholder for the converted document; nothing populates it yet.
            CoNLLDocument convertedDoc = null;

            // Reuse the XML parsing path for the converted document.
            ParseDocumentXml(convertedDoc, parserFactory);
        }
 /// <summary>
 /// Builds a parsed document from a file, choosing the parsing path by input format.
 /// </summary>
 /// <param name="fileName">Path of the file to parse.</param>
 /// <param name="parserFactory">Factory passed to the selected parsing routine.</param>
 /// <param name="isXml">
 /// When true (the default) the file goes through <c>ParseDocumentXml</c>;
 /// otherwise it goes through <c>ParseDocumentCoNLLUP</c>.
 /// </param>
 public ParsedDocument(string fileName, LegalTextParserFactory parserFactory, bool isXml = true)
 {
     // Handle the non-default format first and leave; XML is the fall-through case.
     if (!isXml)
     {
         ParseDocumentCoNLLUP(fileName, parserFactory);
         return;
     }

     ParseDocumentXml(fileName, parserFactory);
 }
Beispiel #3
0
        /// <summary>
        /// Creates an index manager over a source directory: validates the inputs, prepares
        /// the log and index directories, optionally builds a parser factory from a TBX
        /// file, and finally calls <c>SetupWriters</c>.
        /// </summary>
        /// <param name="indexingMode">Stored in <c>m_indexingMode</c>.</param>
        /// <param name="rootDirectory">Source directory; it must already exist.</param>
        /// <param name="indexDirectory">
        /// Root directory for the index. It is created when missing and stored with a
        /// trailing directory separator.
        /// </param>
        /// <param name="tbxIateLocation">
        /// Optional location of a TBX XML file. When given, it is deserialized into a
        /// <c>Tbx</c> and used to construct the <c>LegalTextParserFactory</c>. When null,
        /// <c>m_parserFactory</c> is not assigned by this constructor.
        /// </param>
        /// <param name="logLocation">
        /// Optional log directory. When given, it is created when missing and stored with a
        /// trailing directory separator.
        /// </param>
        /// <param name="parallelIndexingLimit">Not read by this constructor body.</param>
        /// <param name="readOnly">Stored in <c>m_readOnly</c>.</param>
        /// <exception cref="ArgumentException"><paramref name="rootDirectory"/> does not exist.</exception>
        /// <exception cref="ArgumentNullException"><paramref name="indexDirectory"/> is null.</exception>
        public CorpusIndexManager(IndexingMode indexingMode, string rootDirectory, string indexDirectory, string tbxIateLocation = null, string logLocation = null, int? parallelIndexingLimit = null, bool readOnly = true)
        {
            m_readOnly     = readOnly;
            m_indexingMode = indexingMode;

            if (!Directory.Exists(rootDirectory))
            {
                throw new ArgumentException("The source directory must exist!");
            }

            // Reject a missing index directory before touching the file system; previously
            // this surfaced as a NullReferenceException after the log directory was created.
            if (indexDirectory == null)
            {
                throw new ArgumentNullException("indexDirectory");
            }

            // The former "indexing directory must be empty" check is disabled, so an
            // existing, non-empty index directory is accepted.

            m_rootSourceDirectory = rootDirectory;

            if (logLocation != null)
            {
                m_logLocation = EnsureTrailingDirectorySeparator(logLocation);

                // CreateDirectory does nothing when the directory already exists.
                Directory.CreateDirectory(m_logLocation);
            }

            m_rootIndexDirectory = EnsureTrailingDirectorySeparator(indexDirectory);
            Directory.CreateDirectory(m_rootIndexDirectory);

            if (tbxIateLocation != null)
            {
                Tbx sourceTbx;
                var serializerTbx = new System.Xml.Serialization.XmlSerializer(typeof(Tbx));
                using (var xmlReader = XmlReader.Create(tbxIateLocation))
                {
                    sourceTbx = (Tbx)serializerTbx.Deserialize(xmlReader);
                }

                m_parserFactory = new LegalTextParserFactory(sourceTbx);
            }

            SetupWriters();
        }

        // Returns the path ending in Path.DirectorySeparatorChar, appending one only when it
        // is missing. The comparison is ordinal so the current culture cannot affect it.
        private static string EnsureTrailingDirectorySeparator(string path)
        {
            string separator = Path.DirectorySeparatorChar.ToString();
            return path.EndsWith(separator, StringComparison.Ordinal) ? path : path + separator;
        }
        /// <summary>
        /// Runs the two parsing passes over an already loaded document and stores the
        /// result in <c>m_marcellDocument</c>.
        /// </summary>
        /// <param name="sourceDoc">Loaded document; it and its <c>doc</c> member must be non-null.</param>
        /// <param name="parserFactory">Factory used to create a parser for the document's language.</param>
        /// <exception cref="InvalidOperationException">
        /// <paramref name="sourceDoc"/> or its <c>doc</c> member is null.
        /// </exception>
        private void ParseDocumentXml(CoNLLDocument sourceDoc, LegalTextParserFactory parserFactory)
        {
            bool isUnsupported = sourceDoc == null || sourceDoc.doc == null;
            if (isUnsupported)
            {
                throw new InvalidOperationException("Unsupported document found in corpus!");
            }

            // The parser is selected by the language recorded on the document.
            var languageParser = parserFactory.CreateParser(sourceDoc.doc.language);

            // Pass 2 operates on the output of pass 1. The field is assigned only after
            // both passes have returned without throwing.
            var parsedResult = languageParser.ParsePass1(sourceDoc);
            languageParser.ParsePass2(parsedResult);

            m_marcellDocument = parsedResult;
        }
        /// <summary>
        /// Loads a document from an XML file, preprocesses it, and hands it to the
        /// <c>CoNLLDocument</c> overload for parsing.
        /// </summary>
        /// <param name="fileName">Path of the XML file; also passed to <c>Preparse</c>.</param>
        /// <param name="parserFactory">Factory forwarded to the document parsing overload.</param>
        /// <exception cref="InvalidOperationException">
        /// Deserialization produced null, or a document whose <c>doc</c> member is null.
        /// </exception>
        private void ParseDocumentXml(string fileName, LegalTextParserFactory parserFactory)
        {
            CoNLLDocument loadedDoc = null;

            // Both readers are disposed as soon as deserialization has finished.
            using (var sanitizedInput = new SanitizedStreamReader(fileName))
            using (var reader = XmlReader.Create(sanitizedInput))
            {
                loadedDoc = (CoNLLDocument)SerializerCoNLLUP.Deserialize(reader);
            }

            // Checked here as well because Preparse runs before the overload's own guard.
            bool isUnsupported = loadedDoc == null || loadedDoc.doc == null;
            if (isUnsupported)
            {
                throw new InvalidOperationException("Unsupported document found in corpus!");
            }

            // Preprocess the document to fix common problems in xml files.
            Preparse(loadedDoc, fileName);

            ParseDocumentXml(loadedDoc, parserFactory);
        }