/// <summary>Read parse trees from the XML document supplied by a Reader.</summary>
        /// <param name="filename">Name reported in the log message if XML parsing fails.</param>
        /// <param name="in">
        /// The <c>Reader</c> providing the XML content.
        /// </param>
        /// <param name="simplifiedTagset">
        /// If <c>true</c>, convert part-of-speech labels to a
        /// simplified version of the EAGLES tagset, where the tags do not
        /// include extensive morphological analysis.
        /// </param>
        /// <param name="aggressiveNormalization">
        /// Perform aggressive "normalization"
        /// on the trees read from the provided corpus documents:
        /// split multi-word tokens into their constituent words (and
        /// infer parts of speech of the constituent words).
        /// </param>
        /// <param name="retainNER">
        /// Retain NER information in preterminals (for later
        /// use in <c>MultiWordPreprocessor</c>) and add NER-specific
        /// parents to single-word NE tokens.
        /// </param>
        /// <param name="detailedAnnotations">
        /// Retain detailed tree node annotations. These
        /// annotations on parse tree constituents may be useful for
        /// e.g. training a parser.
        /// </param>
        public SpanishXMLTreeReader(string filename, Reader @in, bool simplifiedTagset, bool aggressiveNormalization, bool retainNER, bool detailedAnnotations)
        {
            ITreebankLanguagePack languagePack = new SpanishTreebankLanguagePack();

            this.simplifiedTagset    = simplifiedTagset;
            this.detailedAnnotations = detailedAnnotations;

            // The reader is re-exposed as a byte stream in the language pack's encoding.
            stream         = new ReaderInputStream(@in, languagePack.GetEncoding());
            treeFactory    = new LabeledScoredTreeFactory();
            treeNormalizer = new SpanishTreeNormalizer(simplifiedTagset, aggressiveNormalization, retainNER);

            DocumentBuilder xmlParser = XMLUtils.GetXmlParser();
            try
            {
                // Parse the whole document up front and collect its sentence elements.
                IDocument document     = xmlParser.Parse(stream);
                IElement  documentRoot = document.GetDocumentElement();
                sentences = documentRoot.GetElementsByTagName(NodeSent);
                sentIdx   = 0;
            }
            catch (SAXException parseFailure)
            {
                // NOTE(review): failures are only reported; sentences/sentIdx are not
                // assigned on this path.
                log.Info("Parse exception while reading " + filename);
                Sharpen.Runtime.PrintStackTrace(parseFailure);
            }
            catch (IOException ioFailure)
            {
                Sharpen.Runtime.PrintStackTrace(ioFailure);
            }
        }
 /// <summary>Creates a processor tied to its enclosing <c>AnCoraProcessor</c> instance.</summary>
 /// <param name="_enclosing">The outer instance this processor belongs to.</param>
 /// <param name="tnf">Factory for tree normalizers; it is stored and also used here to create this instance's normalizer.</param>
 /// <param name="tf">Tree factory, stored as-is.</param>
 /// <param name="ner">Flag, stored as-is. NOTE(review): presumably controls NER retention; confirm against callers.</param>
 public MultiWordProcessor(AnCoraProcessor _enclosing, IFactory <TreeNormalizer> tnf, ITreeFactory tf, bool ner)
 {
     this._enclosing = _enclosing;
     this.tf         = tf;
     this.ner        = ner;

     // TreeNormalizer is not thread-safe, so a factory is accepted and kept
     // rather than a shared normalizer; this instance gets its own from it.
     this.tnf = tnf;
     this.tn  = tnf.Create();
 }
 /// <summary>Stores every setting describing one tagged input file; no validation is performed.</summary>
 /// <param name="file">Path of the file, stored as-is.</param>
 /// <param name="format">Format of the file.</param>
 /// <param name="encoding">Character encoding name, stored as-is.</param>
 /// <param name="tagSeparator">Tag separator string, stored as-is.</param>
 /// <param name="treeTransformer">Tree transformer; may be null.</param>
 /// <param name="treeNormalizer">Tree normalizer; may be null.</param>
 /// <param name="trf">Tree reader factory; may be null.</param>
 /// <param name="treeRange">File filter restricting which trees are loaded; may be null.</param>
 /// <param name="treeFilter">Predicate over trees; may be null.</param>
 /// <param name="wordColumn">Word column setting, stored as-is.</param>
 /// <param name="tagColumn">Tag column setting, stored as-is.</param>
 private TaggedFileRecord(string file, TaggedFileRecord.Format format, string encoding, string tagSeparator,
                          ITreeTransformer treeTransformer, TreeNormalizer treeNormalizer, ITreeReaderFactory trf,
                          NumberRangesFileFilter treeRange, IPredicate <Tree> treeFilter, int wordColumn, int tagColumn)
 {
     // NOTE(review): the three notes below appear to describe the Format enum
     // members rather than this constructor; confirm and move them if so.
     //   - a tokenized file separated by text
     //   - a tsv file such as a conll file
     //   - a file in PTB format

     // Plain file description.
     this.file         = file;
     this.format       = format;
     this.encoding     = encoding;
     this.tagSeparator = tagSeparator;

     // Tree-related settings.
     this.trf             = trf;
     this.treeTransformer = treeTransformer;
     this.treeNormalizer  = treeNormalizer;
     this.treeRange       = treeRange;
     this.treeFilter      = treeFilter;

     // Column settings.
     this.wordColumn = wordColumn;
     this.tagColumn  = tagColumn;
 }
 /// <summary>
 /// Builds a reader from the settings in <paramref name="record"/>: copies its
 /// settings, loads the treebank path, obtains an enumerator and calls <c>FindNext()</c>.
 /// </summary>
 /// <param name="record">Description of the tagged file to read.</param>
 public TreeTaggedFileReader(TaggedFileRecord record)
 {
     filename    = record.file;
     transformer = record.treeTransformer;
     normalizer  = record.treeNormalizer;
     treeFilter  = record.treeFilter;

     // Fall back to the default reader factory when the record does not supply one.
     trf      = record.trf ?? new LabeledScoredTreeReaderFactory();
     treebank = new DiskTreebank(trf, record.encoding);

     // Only pass a range to LoadPath when the record specifies one.
     if (record.treeRange == null)
     {
         treebank.LoadPath(filename);
     }
     else
     {
         treebank.LoadPath(filename, record.treeRange);
     }

     treeIterator = treebank.GetEnumerator();
     FindNext();
 }
Ejemplo n.º 5
0
        // Nested nominal group containing period punctuation
        // Match boundaries for subtrees created
        // Match candidate preposition
        // Headed by a group that was generated from
        // multi-word token expansion and that we
        // wish to expand further
        // With an NP on the left (-> this is a
        // prep. phrase) and not preceded by any
        // other prepositions
        // Match candidate preposition
        // Which is the first child in a group that
        // was generated from multi-word token
        // expansion and that we wish to expand
        // further
        // With an NP on the left (-> this is a
        // prep. phrase) and not preceded by any
        // other prepositions
        // In one of our expanded phrases (match
        // bounds of this expanded phrase; these form
        // the left edge of first new subtree and the
        // right edge of the second new subtree)
        // Fetch more bounds: node to immediate left
        // of cc is the right edge of the first new
        // subtree, and node to right of cc is the
        // left edge of the second new subtree
        //
        // NB: left1 may be the same as right1; likewise
        // for the second tree
        // "en opinion del X," "además del Y"
        // -> "(en opinion de) (el X)," "(además de) (el Y)"
        // "del X al Y"
        // ---------
        // Final cleanup operations
        // Should be first-ish
        // Should not happen until the last moment! The function words
        // being targeted have weaker "scope" than others earlier
        // targeted, and so we don't want to clump things around them
        // until we know we have the right to clump
        // Verb phrase-related cleanup.. order is important!
        // Fixes for specific common phrases
        // Lastly..
        //
        // These final fixes are not at all linguistically motivated -- just need to make the trees less dirty
        /// <summary>
        /// Recognize candidate patterns for expansion in the given tree and
        /// perform the expansions.
        /// </summary>
        /// <remarks>See the class documentation for more information.</remarks>
        /// <param name="t">The tree to expand.</param>
        /// <param name="tn">Normalizer applied to the whole tree between the intermediate and final passes.</param>
        /// <param name="tf">Tree factory handed to the normalizer.</param>
        /// <returns>The tree produced by the final cleanup pass.</returns>
        public virtual Tree ExpandPhrases(Tree t, TreeNormalizer tn, ITreeFactory tf)
        {
            // Step 1: apply the first-step patterns repeatedly until a pass leaves
            // the tree unchanged. Nested expressions such as "para tratar de
            // regresar al empleo" need several passes. This step produces many
            // "intermediate" structures that are cleaned up afterwards.
            while (true)
            {
                Tree beforePass = t.DeepCopy();
                t = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ProcessPatternsOnTree(firstStepExpansions, t);
                if (t.Equals(beforePass))
                {
                    break;
                }
            }

            // Step 2: clean up the intermediate tree structures.
            t = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ProcessPatternsOnTree(intermediateExpansions, t);

            // Step 3: normalize before the final pass to allow for contraction expansion, etc.
            t = tn.NormalizeWholeTree(t, tf);

            // Step 4: final cleanup.
            return Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ProcessPatternsOnTree(finalCleanup, t);
        }
        /// <summary>Read parse trees from the XML document supplied by a Reader.</summary>
        /// <param name="in">The <c>Reader</c> providing the XML content.</param>
        /// <param name="tf">TreeFactory -- factory to create some kind of Tree.</param>
        /// <param name="tn">The method of normalizing trees.</param>
        public FrenchXMLTreeReader(Reader @in, ITreeFactory tf, TreeNormalizer tn)
        {
            ITreebankLanguagePack languagePack = new FrenchTreebankLanguagePack();

            // The reader is re-exposed as a byte stream in the language pack's encoding.
            stream         = new ReaderInputStream(@in, languagePack.GetEncoding());
            treeFactory    = tf;
            treeNormalizer = tn;

            DocumentBuilder xmlParser = XMLUtils.GetXmlParser();
            try
            {
                // Parse the whole document up front and collect its sentence elements.
                IDocument document     = xmlParser.Parse(stream);
                IElement  documentRoot = document.GetDocumentElement();
                sentences = documentRoot.GetElementsByTagName(NodeSent);
                sentIdx   = 0;
            }
            catch (Exception failure)
            {
                // NOTE(review): any failure is only printed; sentences/sentIdx are
                // not assigned on this path.
                Sharpen.Runtime.PrintStackTrace(failure);
            }
        }
Ejemplo n.º 7
0
 /// <summary>Creates a factory with an explicit normalizer and fragment-discarding setting.</summary>
 /// <param name="tn">Tree normalizer, stored as-is.</param>
 /// <param name="discardFrags">Flag, stored as-is. NOTE(review): presumably selects a fragment-discarding reader; confirm where it is read.</param>
 public CTBTreeReaderFactory(TreeNormalizer tn, bool discardFrags)
 {
     this.discardFrags = discardFrags;
     this.tn           = tn;
 }
Ejemplo n.º 8
0
 /// <summary>Creates a factory with the given normalizer and <c>discardFrags</c> set to <c>false</c>.</summary>
 /// <param name="tn">Tree normalizer passed on to the two-argument constructor.</param>
 public CTBTreeReaderFactory(TreeNormalizer tn)
     : this(tn, discardFrags: false)
 {
 }
 /// <summary>
 /// Specify your own <see cref="ITreeFactory"/> and <see cref="TreeNormalizer"/>;
 /// uses a <see cref="PennTreebankTokenizer"/>.
 /// </summary>
 /// <param name="tf">The TreeFactory to use in building Tree objects to return.</param>
 /// <param name="tn">The TreeNormalizer to use.</param>
 public PennTreeReaderFactory(ITreeFactory tf, TreeNormalizer tn)
 {
     this.tn = tn;
     this.tf = tf;
 }
 /// <summary>
 /// Specify your own <see cref="TreeNormalizer"/>;
 /// uses a <see cref="PennTreebankTokenizer"/> and a newly created
 /// <see cref="LabeledScoredTreeFactory"/>.
 /// </summary>
 /// <param name="tn">The TreeNormalizer to use in building Tree objects to return.</param>
 public PennTreeReaderFactory(TreeNormalizer tn)
     : this(tf: new LabeledScoredTreeFactory(), tn: tn)
 {
 }
        /// <summary>
        /// Builds a <c>TaggedFileRecord</c> from a textual description.
        /// </summary>
        /// <remarks>
        /// The description is split on commas. A description with a single piece is
        /// used whole as the file name, with the Text format and the encoding and tag
        /// separator taken from <paramref name="config"/>. Otherwise the last piece
        /// is the file name and every earlier piece must be a <c>key=value</c> option;
        /// keys are compared case-insensitively.
        /// </remarks>
        /// <param name="config">Properties supplying the default encoding and tag separator.</param>
        /// <param name="description">Comma-separated options followed by the file name.</param>
        /// <returns>A record populated from the defaults and any options given.</returns>
        /// <exception cref="ArgumentException">
        /// An option has no <c>=</c>, or its key is not recognized.
        /// </exception>
        public static Edu.Stanford.Nlp.Tagger.IO.TaggedFileRecord CreateRecord(Properties config, string description)
        {
            string[] segments = description.Split(",");
            if (segments.Length == 1)
            {
                // No options given: the whole description is the file name.
                return new Edu.Stanford.Nlp.Tagger.IO.TaggedFileRecord(description, TaggedFileRecord.Format.Text, GetEncoding(config), GetTagSeparator(config), null, null, null, null, null, null, null);
            }

            // Every segment except the last is an option; the last is the file name.
            int      optionCount = segments.Length - 1;
            string[] options     = new string[optionCount];
            System.Array.Copy(segments, 0, options, 0, optionCount);
            string file = segments[optionCount];

            // Defaults, overridden below by any matching option.
            TaggedFileRecord.Format format         = TaggedFileRecord.Format.Text;
            string                 encoding        = GetEncoding(config);
            string                 tagSeparator    = GetTagSeparator(config);
            ITreeTransformer       treeTransformer = null;
            TreeNormalizer         treeNormalizer  = null;
            ITreeReaderFactory     trf             = null;
            NumberRangesFileFilter treeRange       = null;
            IPredicate <Tree>      treeFilter      = null;
            // NOTE(review): a C# int cannot hold null; these two look like leftovers of
            // a Java Integer conversion. Confirm the intended type (likely int?).
            int wordColumn = null;
            int tagColumn  = null;

            foreach (string option in options)
            {
                // Split on the first '=' only, so the value may itself contain '='.
                string[] keyValue = option.Split("=", 2);
                if (keyValue.Length != 2)
                {
                    throw new ArgumentException("TaggedFileRecord argument " + option + " has an unexpected number of =s");
                }

                string key   = keyValue[0];
                string value = keyValue[1];

                if (Sharpen.Runtime.EqualsIgnoreCase(key, Format))
                {
                    format = TaggedFileRecord.Format.ValueOf(value);
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(key, Encoding))
                {
                    encoding = value;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(key, TagSeparator))
                {
                    tagSeparator = value;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(key, TreeTransformer))
                {
                    treeTransformer = ReflectionLoading.LoadByReflection(value);
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(key, TreeNormalizer))
                {
                    treeNormalizer = ReflectionLoading.LoadByReflection(value);
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(key, TreeReader))
                {
                    trf = ReflectionLoading.LoadByReflection(value);
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(key, TreeRange))
                {
                    // Commas already separate options, so ranges are written with ':'
                    // and converted back to ',' before being handed to the filter.
                    string range = value.ReplaceAll(":", ",");
                    treeRange = new NumberRangesFileFilter(range, true);
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(key, TreeFilter))
                {
                    treeFilter = ReflectionLoading.LoadByReflection(value);
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(key, WordColumn))
                {
                    wordColumn = int.Parse(value);
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(key, TagColumn))
                {
                    tagColumn = int.Parse(value);
                }
                else
                {
                    throw new ArgumentException("TaggedFileRecord argument " + key + " is unknown");
                }
            }

            return new Edu.Stanford.Nlp.Tagger.IO.TaggedFileRecord(file, format, encoding, tagSeparator, treeTransformer, treeNormalizer, trf, treeRange, treeFilter, wordColumn, tagColumn);
        }
Ejemplo n.º 12
0
        /// <summary>
        /// Reads every tree from <paramref name="treeFile"/> (as UTF-8), fixes and
        /// expands it, and writes the result one tree per line to a file whose name
        /// is the input path with ".fixed" appended.
        /// </summary>
        /// <remarks>
        /// Encoding, file-not-found and I/O errors are reported by printing the stack
        /// trace; they are not rethrown. The reader and writer are closed even when
        /// processing fails part-way through.
        /// </remarks>
        /// <param name="treeFile">File containing the trees to process.</param>
        /// <param name="unigramTagger">Counter passed through to <c>TraverseAndFix</c>.</param>
        /// <param name="retainNER">Flag passed through to <c>TraverseAndFix</c>.</param>
        /// <param name="tn">
        /// Normalizer passed to the expander and, when non-null, applied once more
        /// to each expanded tree.
        /// </param>
        private static void ResolveDummyTags(File treeFile, TwoDimensionalCounter <string, string> unigramTagger, bool retainNER, TreeNormalizer tn)
        {
            ITreeFactory          tf       = new LabeledScoredTreeFactory();
            MultiWordTreeExpander expander = new MultiWordTreeExpander();

            try
            {
                BufferedReader     br  = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
                ITreeReaderFactory trf = new SpanishTreeReaderFactory();
                ITreeReader        tr  = trf.NewTreeReader(br);
                int nTrees             = 0;
                try
                {
                    PrintWriter pw = new PrintWriter(new TextWriter(new FileOutputStream(new File(treeFile + ".fixed")), false, "UTF-8"));
                    try
                    {
                        for (Tree t; (t = tr.ReadTree()) != null; nTrees++)
                        {
                            TraverseAndFix(t, null, unigramTagger, retainNER);
                            // Now "decompress" further the expanded trees formed by
                            // multiword token splitting
                            // NOTE(review): tn is handed to ExpandPhrases before the null
                            // check below; confirm ExpandPhrases tolerates a null normalizer.
                            t = expander.ExpandPhrases(t, tn, tf);
                            if (tn != null)
                            {
                                t = tn.NormalizeWholeTree(t, tf);
                            }
                            pw.Println(t.ToString());
                        }
                    }
                    finally
                    {
                        // Close the output even if a tree fails to process, so the
                        // file handle is released.
                        pw.Close();
                    }
                }
                finally
                {
                    // Release the input whether or not the output could be opened or written.
                    tr.Close();
                }
                // Only reached when every tree was processed without an exception.
                System.Console.Out.WriteLine("Processed " + nTrees + " trees");
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
 /// <summary>
 /// Creates the reader by forwarding all four arguments, unchanged and in the
 /// same order, to the base-class constructor. No additional state is set here.
 /// </summary>
 /// <param name="in">Reader forwarded to the base class.</param>
 /// <param name="tf">Tree factory forwarded to the base class.</param>
 /// <param name="tn">Tree normalizer forwarded to the base class.</param>
 /// <param name="tk">String tokenizer forwarded to the base class.</param>
 public FragDiscardingPennTreeReader(Reader @in, ITreeFactory tf, TreeNormalizer tn, ITokenizer <string> tk)
     : base(@in, tf, tn, tk)
 {
 }