/// <summary>
/// Creates a raw-tree normalizer bound to its enclosing dataset and to the two
/// writers it was handed.
/// </summary>
/// <param name="_enclosing">The dataset whose <c>encoding</c> setting selects the encoding map.</param>
/// <param name="outFile">Writer stored in the <c>outfile</c> field.</param>
/// <param name="flatFile">Writer stored in the <c>flatFile</c> field.</param>
public ArabicRawTreeNormalizer(ATBArabicDataset _enclosing, PrintWriter outFile, PrintWriter flatFile)
 {
     this._enclosing = _enclosing;

     // A Utf8 dataset gets the default Buckwalter mapper; every other encoding
     // gets the mapper built with the 'true' flag.
     // NOTE(review): the flag presumably flips the transliteration direction - confirm
     // against the Buckwalter class.
     bool datasetIsUtf8 = this._enclosing.encoding == Dataset.Encoding.Utf8;
     if (datasetIsUtf8)
     {
         this.encodingMap = new Buckwalter();
     }
     else
     {
         this.encodingMap = new Buckwalter(true);
     }

     // Output sinks.
     this.outfile  = outFile;
     this.flatFile = flatFile;

     // Tree filters.
     this.nullFilter   = new ArabicTreeNormalizer.ArabicEmptyFilter();
     this.aOverAFilter = new BobChrisTreeNormalizer.AOverAFilter();

     // Tree factory and language pack.
     this.tf  = new LabeledScoredTreeFactory();
     this.tlp = new ArabicTreebankLanguagePack();
 }
Пример #2
0
        /// <summary>
        /// Builds the two read-only lookup sets of this mapper: <c>parentTagsToEscape</c>
        /// (from <c>parentTagString</c>) and <c>bwClitics</c> (from <c>utf8CliticString</c>
        /// after it has been passed through a <c>Buckwalter(true)</c> mapper).
        /// </summary>
        /// <remarks>
        /// Both source strings are tokenized on runs of whitespace with
        /// <c>Regex.Split</c>. The built-in <c>string.Split(string)</c> treats its argument
        /// as a literal separator rather than a regular expression, so passing "\\s+" to it
        /// would leave each list as a single unsplit element.
        /// </remarks>
        public DefaultLexicalMapper()
        {
            // NOTE(review): the comments in this group appear to describe field declarations
            // that live elsewhere in the class (they do not match any statement in this
            // constructor) - confirm and move them next to the fields they document.
            //   - Buckwalter patterns (U+0627)
            //   - TODO Extend coverage to entire Arabic code chart
            //   - Obviously Buckwalter is a lossy conversion, but no assumptions should be
            //     made about UTF-8 input from "the wild"
            //   - Patterns to fix segmentation issues observed in the ATB
            //   - Process the vocalized section for parsing
            //   - Strip morpheme boundary markers in the vocalized section
            //   - Strip all morpheme and segmentation markers in UTF-8 Arabic

            //wsg: "LATIN" does not appear in the Bies tagset, so be sure to pass
            //in the extended POS tags during normalization
            string[] parentTags = System.Text.RegularExpressions.Regex.Split(parentTagString, "\\s+");
            parentTagsToEscape = Java.Util.Collections.UnmodifiableSet(Generics.NewHashSet(Arrays.AsList(parentTags)));

            // Only the Buckwalter form of the clitics is kept as a set: the UTF-8 clitic
            // list is run through the mapper first and the result is tokenized.
            Buckwalter bw       = new Buckwalter(true);
            string     bwString = bw.Apply(utf8CliticString);

            string[] bwCliticTokens = System.Text.RegularExpressions.Regex.Split(bwString, "\\s+");
            bwClitics = Java.Util.Collections.UnmodifiableSet(Generics.NewHashSet(Arrays.AsList(bwCliticTokens)));
        }