/// <exception cref="System.IO.IOException"/>
/// <remarks>
/// Reads every tree from each file in <c>fileList</c> (decoded with <c>AncoraEncoding</c>)
/// and, for each label in the tree's tagged yield, increments the (word, tag) count in
/// <c>unigramTagger</c>. Labels whose tag equals <c>SpanishTreeNormalizer.MwTag</c> are
/// skipped. The tree reader opened for a file is closed before moving on to the next file.
/// </remarks>
public virtual void Process()
{
    SpanishXMLTreeReaderFactory trf = new SpanishXMLTreeReaderFactory();
    foreach (File file in fileList)
    {
        Reader @in = new BufferedReader(new InputStreamReader(new FileInputStream(file), AncoraEncoding));
        ITreeReader tr = trf.NewTreeReader(@in);
        try
        {
            // Tree reading will implicitly perform tree normalization for us
            Tree t;
            while ((t = tr.ReadTree()) != null)
            {
                // Update tagger with this tree
                IList<CoreLabel> yield = t.TaggedLabeledYield();
                foreach (CoreLabel leafLabel in yield)
                {
                    if (leafLabel.Tag().Equals(SpanishTreeNormalizer.MwTag))
                    {
                        continue;
                    }
                    unigramTagger.IncrementCount(leafLabel.Word(), leafLabel.Tag());
                }
            }
        }
        finally
        {
            // Always release the reader, even when reading fails part-way through a file;
            // without this one handle per input file stays open.
            // NOTE(review): presumably closing the tree reader also closes the wrapped stream — confirm.
            tr.Close();
        }
    }
}
/// <summary>
/// Use
/// <see cref="Edu.Stanford.Nlp.Trees.International.Spanish.SpanishXMLTreeReader"/>
/// to load the trees from the provided files,
/// and begin collecting some statistics to be used in later MWE cleanup.
/// NB: Much of the important cleanup happens implicitly here; the XML tree reader triggers the
/// tree normalization routine.
/// </summary>
/// <remarks>
/// Each file in <c>inputFiles</c> is handed to <c>ProcessTreeFile</c>; the returned counts are
/// merged into <c>unigramTagger</c> and the returned trees are appended to the result.
/// Files for which <c>ProcessTreeFile</c> returns <c>null</c> are skipped.
/// </remarks>
/// <returns>The trees read from all input files that could be processed, in file order.</returns>
/// <exception cref="System.Exception"/>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="Java.Util.Concurrent.ExecutionException"/>
private IList<Tree> LoadTrees()
{
    bool ner = PropertiesUtils.GetBool(options, "ner", false);
    string encoding = new SpanishTreebankLanguagePack().GetEncoding();
    SpanishXMLTreeReaderFactory trf = new SpanishXMLTreeReaderFactory(true, true, ner, false);
    IList<Tree> trees = new List<Tree>();
    foreach (File file in inputFiles)
    {
        Pair<TwoDimensionalCounter<string, string>, IList<Tree>> ret = ProcessTreeFile(file, trf, encoding);
        if (ret == null)
        {
            // ProcessTreeFile signals an I/O failure by returning null; dereferencing it
            // would throw a NullReferenceException, so skip the unreadable file instead.
            continue;
        }
        Counters.AddInPlace(unigramTagger, ret.First());
        Sharpen.Collections.AddAll(trees, ret.Second());
    }
    return trees;
}
/// <summary>Processes a single file containing AnCora XML trees.</summary>
/// <remarks>
/// Processes a single file containing AnCora XML trees. Returns MWE statistics for the trees in
/// the file and the actual parsed trees. Every tree read is repeatedly split at the point
/// reported by <c>FindSplitPoint</c> until no split point remains; each resulting piece is
/// added to the tree list and passed to <c>UpdateTagger</c>.
/// </remarks>
/// <param name="file">The file to read trees from.</param>
/// <param name="trf">Factory used to create the tree reader for the file.</param>
/// <param name="encoding">Name of the character encoding used to decode the file.</param>
/// <returns>
/// A pair of (tagger counts, trees) for the file, or <c>null</c> if an
/// <c>IOException</c> occurred (the stack trace is printed in that case).
/// </returns>
private static Pair<TwoDimensionalCounter<string, string>, IList<Tree>> ProcessTreeFile(File file, SpanishXMLTreeReaderFactory trf, string encoding)
{
    TwoDimensionalCounter<string, string> tagger = new TwoDimensionalCounter<string, string>();
    try
    {
        Reader @in = new BufferedReader(new InputStreamReader(new FileInputStream(file), encoding));
        ITreeReader tr = trf.NewTreeReader(file.GetPath(), @in);
        try
        {
            IList<Tree> trees = new List<Tree>();
            Tree t;
            Tree splitPoint;
            while ((t = tr.ReadTree()) != null)
            {
                do
                {
                    // We may need to split the current tree into multiple parts.
                    // (If not, a call to `split` with a `null` split-point is a
                    // no-op.)
                    splitPoint = FindSplitPoint(t);
                    Pair<Tree, Tree> split = Split(t, splitPoint);
                    Tree toAdd = split.First();
                    t = split.Second();
                    trees.Add(toAdd);
                    UpdateTagger(tagger, toAdd);
                }
                while (splitPoint != null);
            }
            return new Pair<TwoDimensionalCounter<string, string>, IList<Tree>>(tagger, trees);
        }
        finally
        {
            // Close on every exit path so a failure while reading does not leak the reader.
            // An IOException thrown here is still handled by the catch below.
            tr.Close();
        }
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
        return null;
    }
}