/// <summary>Read parse trees from a Reader.</summary>
/// <param name="filename">Name of the file being read; used only for error reporting.</param>
/// <param name="in">The <c>Reader</c> supplying the XML document.</param>
/// <param name="simplifiedTagset">
/// If `true`, convert part-of-speech labels to a simplified version of the
/// EAGLES tagset, where the tags do not include extensive morphological analysis
/// </param>
/// <param name="aggressiveNormalization">
/// Perform aggressive "normalization" on the trees read from the provided
/// corpus documents: split multi-word tokens into their constituent words
/// (and infer parts of speech of the constituent words).
/// </param>
/// <param name="retainNER">
/// Retain NER information in preterminals (for later use in
/// `MultiWordPreprocessor) and add NER-specific parents to single-word NE tokens
/// </param>
/// <param name="detailedAnnotations">
/// Retain detailed tree node annotations. These annotations on parse tree
/// constituents may be useful for e.g. training a parser.
/// </param>
public SpanishXMLTreeReader(string filename, Reader @in, bool simplifiedTagset, bool aggressiveNormalization, bool retainNER, bool detailedAnnotations)
{
    this.simplifiedTagset = simplifiedTagset;
    this.detailedAnnotations = detailedAnnotations;
    // Constituent annotations
    ITreebankLanguagePack languagePack = new SpanishTreebankLanguagePack();
    stream = new ReaderInputStream(@in, languagePack.GetEncoding());
    treeFactory = new LabeledScoredTreeFactory();
    treeNormalizer = new SpanishTreeNormalizer(simplifiedTagset, aggressiveNormalization, retainNER);
    DocumentBuilder xmlParser = XMLUtils.GetXmlParser();
    try
    {
        // Parse the whole document up front; iteration over sentence
        // elements happens lazily via sentIdx.
        IDocument document = xmlParser.Parse(stream);
        sentences = document.GetDocumentElement().GetElementsByTagName(NodeSent);
        sentIdx = 0;
    }
    catch (SAXException e)
    {
        log.Info("Parse exception while reading " + filename);
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Build a multi-word processor bound to the enclosing AnCoraProcessor.
/// </summary>
/// <param name="_enclosing">The AnCoraProcessor instance that owns this processor.</param>
/// <param name="tnf">Factory producing TreeNormalizer instances.</param>
/// <param name="tf">Factory used to build trees.</param>
/// <param name="ner">Whether to retain NER information.</param>
public MultiWordProcessor(AnCoraProcessor _enclosing, IFactory<TreeNormalizer> tnf, ITreeFactory tf, bool ner)
{
    this._enclosing = _enclosing;
    this.tf = tf;
    this.ner = ner;
    // NB: TreeNormalizer is not thread-safe, so we accept and store a
    // TreeNormalizer factory and create a private instance here.
    this.tnf = tnf;
    this.tn = tnf.Create();
}
/// <summary>
/// Capture the full configuration describing one tagged input file.
/// Instances are created via <c>CreateRecord</c>.
/// </summary>
private TaggedFileRecord(string file, TaggedFileRecord.Format format, string encoding, string tagSeparator, ITreeTransformer treeTransformer, TreeNormalizer treeNormalizer, ITreeReaderFactory trf, NumberRangesFileFilter treeRange, IPredicate<Tree> treeFilter, int wordColumn, int tagColumn)
{
    // Format values: a tokenized file separated by text, a tsv file such
    // as a conll file, or a file in PTB format.
    this.file = file;
    this.format = format;
    this.encoding = encoding;
    this.tagSeparator = tagSeparator;
    this.treeTransformer = treeTransformer;
    this.treeNormalizer = treeNormalizer;
    this.trf = trf;
    this.treeRange = treeRange;
    this.treeFilter = treeFilter;
    this.wordColumn = wordColumn;
    this.tagColumn = tagColumn;
}
/// <summary>
/// Open the treebank described by <paramref name="record"/> and position
/// the reader at the first usable tree.
/// </summary>
/// <param name="record">Configuration for the tagged file to read.</param>
public TreeTaggedFileReader(TaggedFileRecord record)
{
    filename = record.file;
    transformer = record.treeTransformer;
    normalizer = record.treeNormalizer;
    treeFilter = record.treeFilter;
    // Fall back to a default reader factory when the record does not
    // supply one.
    if (record.trf != null)
    {
        trf = record.trf;
    }
    else
    {
        trf = new LabeledScoredTreeReaderFactory();
    }
    treebank = new DiskTreebank(trf, record.encoding);
    // Restrict loading to the requested tree range when one was given.
    if (record.treeRange == null)
    {
        treebank.LoadPath(filename);
    }
    else
    {
        treebank.LoadPath(filename, record.treeRange);
    }
    treeIterator = treebank.GetEnumerator();
    FindNext();
}
// Nested nominal group containing period punctuation // Match boundaries for subtrees created // Match candidate preposition // Headed by a group that was generated from // multi-word token expansion and that we // wish to expand further // With an NP on the left (-> this is a // prep. phrase) and not preceded by any // other prepositions // Match candidate preposition // Which is the first child in a group that // was generated from multi-word token // expansion and that we wish to expand // further // With an NP on the left (-> this is a // prep. phrase) and not preceded by any // other prepositions // In one of our expanded phrases (match // bounds of this expanded phrase; these form // the left edge of first new subtree and the // right edge of the second new subtree) // Fetch more bounds: node to immediate left // of cc is the right edge of the first new // subtree, and node to right of cc is the // left edge of the second new subtree // // NB: left1 may the same as right1; likewise // for the second tree // "en opinion del X," "además del Y" // -> "(en opinion de) (el X)," "(además de) (el Y)" // "del X al Y" // --------- // Final cleanup operations // Should be first-ish // Should not happen until the last moment! The function words // being targeted have weaker "scope" than others earlier // targeted, and so we don't want to clump things around them // until we know we have the right to clump // Verb phrase-related cleanup.. order is important! // Fixes for specific common phrases // Lastly.. // // These final fixes are not at all linguistically motivated -- just need to make the trees less dirty /// <summary> /// Recognize candidate patterns for expansion in the given tree and /// perform the expansions. /// </summary> /// <remarks> /// Recognize candidate patterns for expansion in the given tree and /// perform the expansions. See the class documentation for more /// information. 
/// </remarks>
public virtual Tree ExpandPhrases(Tree t, TreeNormalizer tn, ITreeFactory tf)
{
    // Run the first-step patterns repeatedly until the tree reaches a
    // fixed point.  Nested expressions like "para tratar de regresar al
    // empleo" need multiple passes; those passes leave behind
    // "intermediate" tree structures that are cleaned up afterwards.
    Tree previous;
    do
    {
        previous = t.DeepCopy();
        t = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ProcessPatternsOnTree(firstStepExpansions, t);
    }
    while (!t.Equals(previous));
    // Collapse the intermediate tree structures produced above.
    t = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ProcessPatternsOnTree(intermediateExpansions, t);
    // Normalize before the last pass so contraction expansion, etc. has
    // already been applied.
    t = tn.NormalizeWholeTree(t, tf);
    // Final cleanup patterns.
    return Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ProcessPatternsOnTree(finalCleanup, t);
}
/// <summary>Read parse trees from a Reader.</summary>
/// <param name="in">Reader over the XML treebank document</param>
/// <param name="tf">TreeFactory -- factory to create some kind of Tree</param>
/// <param name="tn">the method of normalizing trees</param>
public FrenchXMLTreeReader(Reader @in, ITreeFactory tf, TreeNormalizer tn)
{
    treeFactory = tf;
    treeNormalizer = tn;
    // Prefix for MWE nodes
    ITreebankLanguagePack languagePack = new FrenchTreebankLanguagePack();
    stream = new ReaderInputStream(@in, languagePack.GetEncoding());
    DocumentBuilder xmlParser = XMLUtils.GetXmlParser();
    try
    {
        // Parse the whole document up front; sentences are consumed
        // lazily starting from sentIdx.
        IDocument document = xmlParser.Parse(stream);
        sentences = document.GetDocumentElement().GetElementsByTagName(NodeSent);
        sentIdx = 0;
    }
    catch (Exception e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Build a factory using the given normalizer and fragment-discarding
/// behavior.
/// </summary>
/// <param name="tn">The TreeNormalizer applied to trees produced by readers from this factory.</param>
/// <param name="discardFrags">Whether readers from this factory discard fragments.</param>
public CTBTreeReaderFactory(TreeNormalizer tn, bool discardFrags)
{
    this.discardFrags = discardFrags;
    this.tn = tn;
}
/// <summary>
/// Build a factory using the given normalizer, with fragment discarding
/// disabled.
/// </summary>
/// <param name="tn">The TreeNormalizer applied to trees produced by readers from this factory.</param>
public CTBTreeReaderFactory(TreeNormalizer tn)
    : this(tn, false)
{
}
/// <summary>
/// Specify your own <see cref="ITreeFactory"/>; uses a
/// <see cref="PennTreebankTokenizer"/> and a <see cref="TreeNormalizer"/>.
/// </summary>
/// <param name="tf">The TreeFactory to use in building Tree objects to return.</param>
/// <param name="tn">The TreeNormalizer to use</param>
public PennTreeReaderFactory(ITreeFactory tf, TreeNormalizer tn)
{
    this.tn = tn;
    this.tf = tf;
}
/// <summary>
/// Specify your own <see cref="TreeNormalizer"/>; uses a
/// <see cref="PennTreebankTokenizer"/> and a
/// <see cref="LabeledScoredTreeFactory"/>.
/// </summary>
/// <param name="tn">The TreeNormalizer to use in building Tree objects to return.</param>
public PennTreeReaderFactory(TreeNormalizer tn)
    : this(new LabeledScoredTreeFactory(), tn)
{
}
/// <summary>
/// Parse a comma-separated file description into a TaggedFileRecord.
/// The final comma-separated piece is the filename; every earlier piece
/// is a <c>key=value</c> option (format, encoding, tagSeparator,
/// treeTransformer, treeNormalizer, treeReader, treeRange, treeFilter,
/// wordColumn, tagColumn).  A description with no commas is treated as a
/// plain text file using defaults from <paramref name="config"/>.
/// </summary>
/// <param name="config">Properties supplying default encoding and tag separator.</param>
/// <param name="description">Comma-separated options followed by the filename.</param>
/// <exception cref="ArgumentException">If an option lacks an `=` or its key is unknown.</exception>
public static Edu.Stanford.Nlp.Tagger.IO.TaggedFileRecord CreateRecord(Properties config, string description)
{
    string[] pieces = description.Split(",");
    // No options given: a bare filename in the default Text format.
    if (pieces.Length == 1)
    {
        return (new Edu.Stanford.Nlp.Tagger.IO.TaggedFileRecord(description, TaggedFileRecord.Format.Text, GetEncoding(config), GetTagSeparator(config), null, null, null, null, null, null, null));
    }
    // All pieces except the last are key=value arguments; the last is the file.
    string[] args = new string[pieces.Length - 1];
    System.Array.Copy(pieces, 0, args, 0, pieces.Length - 1);
    string file = pieces[pieces.Length - 1];
    // Defaults, overridden below by any matching key=value argument.
    TaggedFileRecord.Format format = TaggedFileRecord.Format.Text;
    string encoding = GetEncoding(config);
    string tagSeparator = GetTagSeparator(config);
    ITreeTransformer treeTransformer = null;
    TreeNormalizer treeNormalizer = null;
    ITreeReaderFactory trf = null;
    NumberRangesFileFilter treeRange = null;
    IPredicate<Tree> treeFilter = null;
    // NOTE(review): assigning null to a non-nullable int is a Sharpen
    // conversion artifact (the Java original used Integer) and will not
    // compile as C#; these should presumably be `int?` -- confirm against
    // the TaggedFileRecord constructor before changing.
    int wordColumn = null;
    int tagColumn = null;
    foreach (string arg in args)
    {
        // Split on the first `=` only, so values may themselves contain `=`.
        string[] argPieces = arg.Split("=", 2);
        if (argPieces.Length != 2)
        {
            throw new ArgumentException("TaggedFileRecord argument " + arg + " has an unexpected number of =s");
        }
        // Case-insensitive dispatch on the option key (Sharpen-generated
        // nested else-ladder).
        if (Sharpen.Runtime.EqualsIgnoreCase(argPieces[0], Format))
        {
            format = TaggedFileRecord.Format.ValueOf(argPieces[1]);
        }
        else
        {
            if (Sharpen.Runtime.EqualsIgnoreCase(argPieces[0], Encoding))
            {
                encoding = argPieces[1];
            }
            else
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(argPieces[0], TagSeparator))
                {
                    tagSeparator = argPieces[1];
                }
                else
                {
                    if (Sharpen.Runtime.EqualsIgnoreCase(argPieces[0], TreeTransformer))
                    {
                        treeTransformer = ReflectionLoading.LoadByReflection(argPieces[1]);
                    }
                    else
                    {
                        if (Sharpen.Runtime.EqualsIgnoreCase(argPieces[0], TreeNormalizer))
                        {
                            treeNormalizer = ReflectionLoading.LoadByReflection(argPieces[1]);
                        }
                        else
                        {
                            if (Sharpen.Runtime.EqualsIgnoreCase(argPieces[0], TreeReader))
                            {
                                trf = ReflectionLoading.LoadByReflection(argPieces[1]);
                            }
                            else
                            {
                                if (Sharpen.Runtime.EqualsIgnoreCase(argPieces[0], TreeRange))
                                {
                                    // Ranges are written with `:` in the description because
                                    // `,` is the piece separator; convert back for the filter.
                                    string range = argPieces[1].ReplaceAll(":", ",");
                                    treeRange = new NumberRangesFileFilter(range, true);
                                }
                                else
                                {
                                    if (Sharpen.Runtime.EqualsIgnoreCase(argPieces[0], TreeFilter))
                                    {
                                        treeFilter = ReflectionLoading.LoadByReflection(argPieces[1]);
                                    }
                                    else
                                    {
                                        if (Sharpen.Runtime.EqualsIgnoreCase(argPieces[0], WordColumn))
                                        {
                                            wordColumn = int.Parse(argPieces[1]);
                                        }
                                        else
                                        {
                                            if (Sharpen.Runtime.EqualsIgnoreCase(argPieces[0], TagColumn))
                                            {
                                                tagColumn = int.Parse(argPieces[1]);
                                            }
                                            else
                                            {
                                                throw new ArgumentException("TaggedFileRecord argument " + argPieces[0] + " is unknown");
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
    return (new Edu.Stanford.Nlp.Tagger.IO.TaggedFileRecord(file, format, encoding, tagSeparator, treeTransformer, treeNormalizer, trf, treeRange, treeFilter, wordColumn, tagColumn));
}
/// <summary>
/// Re-read the trees in <paramref name="treeFile"/>, fix dummy tags via
/// <c>TraverseAndFix</c>, expand multi-word phrases, optionally normalize,
/// and write the results to a sibling file named <c>treeFile + ".fixed"</c>.
/// </summary>
/// <param name="treeFile">File containing the trees to fix.</param>
/// <param name="unigramTagger">Unigram tag counts consulted by TraverseAndFix.</param>
/// <param name="retainNER">Whether to retain NER information while fixing.</param>
/// <param name="tn">Normalizer applied to each tree; may be null to skip normalization.</param>
private static void ResolveDummyTags(File treeFile, TwoDimensionalCounter<string, string> unigramTagger, bool retainNER, TreeNormalizer tn)
{
    ITreeFactory tf = new LabeledScoredTreeFactory();
    MultiWordTreeExpander expander = new MultiWordTreeExpander();
    // Declared outside the try so they can be closed in the finally block.
    ITreeReader tr = null;
    PrintWriter pw = null;
    try
    {
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
        ITreeReaderFactory trf = new SpanishTreeReaderFactory();
        tr = trf.NewTreeReader(br);
        pw = new PrintWriter(new TextWriter(new FileOutputStream(new File(treeFile + ".fixed")), false, "UTF-8"));
        int nTrees = 0;
        for (Tree t; (t = tr.ReadTree()) != null; nTrees++)
        {
            TraverseAndFix(t, null, unigramTagger, retainNER);
            // Now "decompress" further the expanded trees formed by
            // multiword token splitting
            t = expander.ExpandPhrases(t, tn, tf);
            if (tn != null)
            {
                t = tn.NormalizeWholeTree(t, tf);
            }
            pw.Println(t.ToString());
        }
        System.Console.Out.WriteLine("Processed " + nTrees + " trees");
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    finally
    {
        // Fix: previously the Close() calls lived inside the try body, so
        // both streams leaked whenever an exception interrupted the loop.
        if (pw != null)
        {
            pw.Close();
        }
        if (tr != null)
        {
            try
            {
                tr.Close();
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
    }
}
/// <summary>
/// Build a Penn tree reader over <paramref name="in"/> that delegates
/// entirely to the base PennTreeReader; the frag-discarding behavior is
/// implemented by this subclass.
/// </summary>
/// <param name="in">Reader over the treebank text.</param>
/// <param name="tf">Factory used to build the trees that are returned.</param>
/// <param name="tn">Normalizer applied to each tree.</param>
/// <param name="tk">Tokenizer for the treebank text.</param>
public FragDiscardingPennTreeReader(Reader @in, ITreeFactory tf, TreeNormalizer tn, ITokenizer<string> tk)
    : base(@in, tf, tn, tk)
{
}