/// <summary>
/// Creates a raw-tree normalizer tied to its enclosing dataset and to the two
/// writers it is handed.
/// </summary>
/// <param name="_enclosing">
/// Enclosing dataset instance; its <c>encoding</c> value decides which
/// <c>Buckwalter</c> mapping is stored in <c>encodingMap</c>.
/// </param>
/// <param name="outFile">Writer stored in the <c>outfile</c> field.</param>
/// <param name="flatFile">Writer stored in the <c>flatFile</c> field.</param>
public ArabicRawTreeNormalizer(ATBArabicDataset _enclosing, PrintWriter outFile, PrintWriter flatFile)
{
    this._enclosing = _enclosing;

    // UTF-8 datasets get the default Buckwalter mapping; every other encoding
    // gets the mapping built with the `true` flag.
    // NOTE(review): presumably `true` selects the Unicode-to-Buckwalter direction; confirm against Buckwalter.
    if (_enclosing.encoding == Dataset.Encoding.Utf8)
    {
        this.encodingMap = new Buckwalter();
    }
    else
    {
        this.encodingMap = new Buckwalter(true);
    }

    // Output destinations.
    this.outfile = outFile;
    this.flatFile = flatFile;

    // Tree filters, tree factory and language pack used by this normalizer.
    this.nullFilter = new ArabicTreeNormalizer.ArabicEmptyFilter();
    this.aOverAFilter = new BobChrisTreeNormalizer.AOverAFilter();
    this.tf = new LabeledScoredTreeFactory();
    this.tlp = new ArabicTreebankLanguagePack();
}
/// <summary>
/// Builds the lookup sets used by the mapper: the parent tags to escape
/// (<c>parentTagsToEscape</c>) and the Buckwalter-encoded clitics (<c>bwClitics</c>).
/// </summary>
/// <remarks>
/// Both source strings are whitespace-separated token lists. They are split with
/// a regular expression because the pattern <c>"\\s+"</c> comes from a Java
/// <c>String.split</c> call, which is regex-based; .NET's <c>string.Split(string)</c>
/// would treat the pattern as a literal separator and leave the list unsplit.
/// </remarks>
public DefaultLexicalMapper()
{
    // Notes carried over from the field declarations this constructor initializes
    // (the declarations themselves are outside this block):
    //   - Buckwalter patterns (U+0627).
    //     TODO Extend coverage to entire Arabic code chart.
    //     Obviously Buckwalter is a lossful conversion, but no assumptions should
    //     be made about UTF-8 input from "the wild".
    //   - Patterns to fix segmentation issues observed in the ATB.
    //   - Process the vocalized section for parsing.
    //   - Strip morpheme boundary markers in the vocalized section.
    //   - Strip all morpheme and segmentation markers in UTF-8 Arabic.
    //   - wsg: "LATIN" does not appear in the Bies tagset, so be sure to pass
    //     in the extended POS tags during normalization.

    string[] parentTags = System.Text.RegularExpressions.Regex.Split(parentTagString, "\\s+");
    parentTagsToEscape = Java.Util.Collections.UnmodifiableSet(Generics.NewHashSet(Arrays.AsList(parentTags)));

    // The UTF-8 clitic set (utf8Clitics) was disabled in the original code; only
    // the Buckwalter-encoded form of the clitic list is populated here.
    // NOTE(review): presumably `true` selects the Unicode-to-Buckwalter direction; confirm against Buckwalter.
    Buckwalter bw = new Buckwalter(true);
    string bwString = bw.Apply(utf8CliticString);

    string[] clitics = System.Text.RegularExpressions.Regex.Split(bwString, "\\s+");
    bwClitics = Java.Util.Collections.UnmodifiableSet(Generics.NewHashSet(Arrays.AsList(clitics)));
}