/// <summary>
/// Creates the mapper and populates its two lookup sets:
/// <c>parentTagsToEscape</c> and <c>bwClitics</c>.
/// </summary>
/// <remarks>
/// Each set is obtained by splitting a source string on the pattern
/// <c>"\\s+"</c>, collecting the pieces into a hash set, and wrapping that
/// set as unmodifiable.
/// </remarks>
public DefaultLexicalMapper()
{
    // Notes carried over from the original comments. They read like headings
    // for member declarations that are not visible in this block (presumably
    // the pattern/tag fields of the class -- confirm against the declarations):
    //   * Buckwalter patterns (U+0627).
    //     TODO: Extend coverage to the entire Arabic code chart.
    //     Buckwalter is a lossy conversion, but no assumptions should be
    //     made about UTF-8 input from "the wild".
    //   * Patterns to fix segmentation issues observed in the ATB.
    //   * Processing of the vocalized section for parsing.
    //   * Stripping of morpheme boundary markers in the vocalized section.
    //   * Stripping of all morpheme and segmentation markers in UTF-8 Arabic.
    //   * wsg: "LATIN" does not appear in the Bies tagset, so be sure to pass
    //     in the extended POS tags during normalization.

    // NOTE(review): "\\s+" looks like a regular-expression pattern; confirm
    // that the Split overload resolved here treats it as one and not as a
    // literal separator string.
    var parentTagTokens = parentTagString.Split("\\s+");
    var parentTagSet = Generics.NewHashSet(Arrays.AsList(parentTagTokens));
    parentTagsToEscape = Java.Util.Collections.UnmodifiableSet(parentTagSet);

    // The UTF-8 clitic set (utf8Clitics) is currently disabled; only the
    // Buckwalter-encoded clitic set is built below.

    // Presumably `true` selects the Unicode-to-Buckwalter direction --
    // confirm against the Buckwalter class.
    string cliticsInBuckwalter = new Buckwalter(true).Apply(utf8CliticString);
    var cliticTokens = cliticsInBuckwalter.Split("\\s+");
    var cliticSet = Generics.NewHashSet(Arrays.AsList(cliticTokens));
    bwClitics = Java.Util.Collections.UnmodifiableSet(cliticSet);
}