/**
 * Copy adverb modifier-type information from an NIH AdvEntry record onto a
 * simplenlg WordElement. Only the modifier type is read; everything else in
 * the record is ignored for now.
 *
 * @param wordElement the word receiving the features
 * @param advEntry    the NIH adverb entry to read from
 */
private void addAdverbInfo(WordElement wordElement, AdvEntry advEntry)
{
    bool modifiesVerb = false;
    bool modifiesSentence = false;
    bool isIntensifier = false;

    // The three prefixes are mutually exclusive, so each entry can switch on
    // at most one flag; any other modification type leaves all flags untouched.
    foreach (string entry in advEntry.GetModification())
    {
        modifiesVerb |= entry.StartsWith("verb_modifier", StringComparison.Ordinal);
        modifiesSentence |= entry.StartsWith("sentence_modifier", StringComparison.Ordinal);
        isIntensifier |= entry.StartsWith("intensifier", StringComparison.Ordinal);
    }

    wordElement.setFeature(LexicalFeature.VERB_MODIFIER, modifiesVerb);
    wordElement.setFeature(LexicalFeature.SENTENCE_MODIFIER, modifiesSentence);
    wordElement.setFeature(LexicalFeature.INTENSIFIER, isIntensifier);
}
/**
 * Copy adjective position information from an NIH AdjEntry record onto a
 * simplenlg WordElement. Only the position codes are read; everything else
 * in the record is ignored for now.
 *
 * @param wordElement the word receiving the features
 * @param adjEntry    the NIH adjective entry to read from
 */
private void addAdjectiveInfo(WordElement wordElement, AdjEntry adjEntry)
{
    bool isQualitative = false;
    bool isColour = false;
    bool isClassifying = false;
    bool isPredicative = false;

    foreach (string pos in adjEntry.GetPosition())
    {
        if (pos.StartsWith("attrib(1)", StringComparison.Ordinal))
        {
            isQualitative = true;
            continue;
        }

        if (pos.StartsWith("attrib(2)", StringComparison.Ordinal))
        {
            isColour = true;
            continue;
        }

        if (pos.StartsWith("attrib(3)", StringComparison.Ordinal))
        {
            isClassifying = true;
            continue;
        }

        if (pos.StartsWith("pred", StringComparison.Ordinal))
        {
            isPredicative = true;
        }

        // any other position code is ignored
    }

    wordElement.setFeature(LexicalFeature.QUALITATIVE, isQualitative);
    wordElement.setFeature(LexicalFeature.COLOUR, isColour);
    wordElement.setFeature(LexicalFeature.CLASSIFYING, isClassifying);
    wordElement.setFeature(LexicalFeature.PREDICATIVE, isPredicative);
}
/**
 * Extract noun information from an NIH NounEntry record and add it to a
 * simplenlg WordElement. Records the inflectional variants, the default
 * inflection and whether the noun is proper. The count/non-count
 * distinction (LexicalFeature.NON_COUNT) is currently not extracted.
 *
 * @param wordElement the word receiving the features
 * @param nounEntry   the NIH noun entry to read from
 */
private void addNounInfo(WordElement wordElement, NounEntry nounEntry)
{
    bool proper = nounEntry.IsProper();

    // add the inflectional variants
    List<string> variants = nounEntry.GetVariants();
    if (variants.Count > 0)
    {
        IList<Inflection> wordVariants = new List<Inflection>();

        foreach (string v in variants)
        {
            // only the part before an optional '|' is the inflection code
            int index = v.IndexOf("|", StringComparison.Ordinal);
            string rawCode = index > -1 ? v.Substring(0, index) : v;

            // Codes are machine-readable identifiers, so lower-case them
            // culture-invariantly; a culture-sensitive ToLower() can produce
            // different characters under e.g. Turkish casing rules.
            string code = rawCode.ToLowerInvariant().Trim();

            Inflection? infl = Inflection.REGULAR.getInflCode(code);
            if (infl != null)
            {
                wordVariants.Add((Inflection)infl);
                wordElement.addInflectionalVariant((Inflection)infl);
            }
        }

        // if the variants include "reg" (or none was recognised), regular is
        // the default; otherwise the first recognised variant is used
        Inflection defaultVariant = wordVariants.Contains(Inflection.REGULAR) || wordVariants.Count == 0
            ? Inflection.REGULAR
            : wordVariants[0];
        wordElement.setFeature(LexicalFeature.DEFAULT_INFL, defaultVariant);
        wordElement.setDefaultInflectionalVariant(defaultVariant);
    }

    wordElement.setFeature(LexicalFeature.PROPER, proper);
    // ignore (for now) other info in record
}
/**
 * Copy the spelling variants of a word from an NIH record onto the
 * simplenlg WordElement.
 *
 * The variants are stored as a list of strings under
 * LexicalFeature.SPELL_VARS (only when the record has at least one), and
 * the word's base form is always stored as LexicalFeature.DEFAULT_SPELL.
 *
 * @param wordElement the word receiving the features
 * @param record      the NIH lexical record to read from
 */
private void addSpellingVariants(WordElement wordElement, LexRecord record)
{
    List<string> spellings = record.GetSpellingVars();
    bool hasSpellings = spellings != null && spellings.Count != 0;

    if (hasSpellings)
    {
        wordElement.setFeature(LexicalFeature.SPELL_VARS, spellings);
    }

    // the default spelling is always the base form
    wordElement.setFeature(LexicalFeature.DEFAULT_SPELL, wordElement.BaseForm);
}
/**
 * Extract verb information from an NIH VerbEntry record and add it to a
 * simplenlg WordElement. Records whether the verb is intransitive,
 * transitive (including complex-transitive) and/or ditransitive, plus its
 * inflectional variants and default inflection.
 *
 * @param wordElement the word receiving the features
 * @param verbEntry   the NIH verb entry to read from; may be null, in which
 *                    case the word is simply flagged as transitive
 */
private void addVerbInfo(WordElement wordElement, VerbEntry verbEntry)
{
    if (verbEntry == null)
    {
        // should only happen for aux verbs, which have auxEntry instead of
        // verbEntry in NIH Lex: just flag as transitive and return
        wordElement.setFeature(LexicalFeature.INTRANSITIVE, false);
        wordElement.setFeature(LexicalFeature.TRANSITIVE, true);
        wordElement.setFeature(LexicalFeature.DITRANSITIVE, false);
        return;
    }

    bool intransitiveVerb = verbEntry.GetIntran().Any();
    bool transitiveVerb = verbEntry.GetTran().Any() || verbEntry.GetCplxtran().Any();
    bool ditransitiveVerb = verbEntry.GetDitran().Any();

    wordElement.setFeature(LexicalFeature.INTRANSITIVE, intransitiveVerb);
    wordElement.setFeature(LexicalFeature.TRANSITIVE, transitiveVerb);
    wordElement.setFeature(LexicalFeature.DITRANSITIVE, ditransitiveVerb);

    // add the inflectional variants
    List<string> variants = verbEntry.GetVariants();
    if (variants.Count > 0)
    {
        IList<Inflection> wordVariants = new List<Inflection>();

        foreach (string v in variants)
        {
            // only the part before an optional '|' is the inflection code
            int index = v.IndexOf("|", StringComparison.Ordinal);
            string rawCode = index > -1 ? v.Substring(0, index) : v;

            // Codes are machine-readable identifiers, so lower-case them
            // culture-invariantly; a culture-sensitive ToLower() can produce
            // different characters under e.g. Turkish casing rules.
            string code = rawCode.ToLowerInvariant().Trim();

            Inflection? infl = Inflection.REGULAR.getInflCode(code);
            if (infl != null)
            {
                wordElement.addInflectionalVariant((Inflection)infl);
                wordVariants.Add((Inflection)infl);
            }
        }

        // if the variants include "reg" (or none was recognised), regular is
        // the default; otherwise the first recognised variant is used
        Inflection defaultVariant = wordVariants.Contains(Inflection.REGULAR) || wordVariants.Count == 0
            ? Inflection.REGULAR
            : wordVariants[0];
        wordElement.setDefaultInflectionalVariant(defaultVariant);
    }

    // ignore (for now) other info in record
}
// /**
//  * convenience method to test that a list is not null and not empty
//  *
//  * @param list
//  * @return
//  */
//private bool notEmpty<T1>(IList<T1> list)
//{
//    return list != null && list.Count > 0;
//}

/**
 * Extract information about acronyms from an NIH record and add it to a
 * simplenlg WordElement.
 *
 * Acronyms are represented as lists of word elements: the full forms this
 * word stands for are stored under LexicalFeature.ACRONYM_OF. Only entries
 * containing a '|' are used; the text after the '|' is treated as the ID
 * of the full-form word and looked up with getWordByID. Entries whose
 * lookup yields no word are skipped.
 *
 * @param wordElement the word receiving the feature
 * @param record      the NIH lexical record to read from
 */
private void addAcronymInfo(WordElement wordElement, LexRecord record)
{
    // NB: the "acronyms" are actually the full forms of which the word is
    // an acronym
    List<string> acronyms = record.GetAcronyms();
    if (acronyms == null || acronyms.Count == 0)
    {
        return;
    }

    // the list of full forms of which this word is an acronym
    List<NLGElement> acronymOf = wordElement.getFeatureAsElementList(LexicalFeature.ACRONYM_OF);

    foreach (string fullForm in acronyms)
    {
        int separator = fullForm.IndexOf("|", StringComparison.Ordinal);
        if (separator < 0)
        {
            continue;
        }

        // everything after the '|' is the ID of the full-form word
        string acronymID = fullForm.SubstringSpecial(separator + 1, fullForm.Length);
        WordElement fullFormWE = getWordByID(acronymID);

        // Check the looked-up element, not the source string: the string is
        // never null here, so testing it would let a failed lookup put a
        // null entry into the list.
        if (fullFormWE != null)
        {
            acronymOf.Add(fullFormWE);
        }
    }

    // set all the full forms for this acronym
    wordElement.setFeature(LexicalFeature.ACRONYM_OF, acronymOf);
}
/**
 * Build a WordElement from an NIH lexical record: base form, category and
 * ID, then category-specific features, inflected forms, acronym info and
 * spelling variants.
 *
 * @param record the NIH lexical record to convert
 * @return the populated word element
 */
private WordElement makeWord(LexRecord record)
{
    // basic data
    string lemma = record.GetBase();
    LexicalCategory lexCategory = record.GetSimpleNLGCategory(record);
    string eui = record.GetEui();

    WordElement word = new WordElement(lemma, lexCategory, eui);

    // category-specific information; closed-class words get none
    LexicalCategory.LexicalCategoryEnum kind = lexCategory.GetLexicalCategory();
    if (kind == LexicalCategory.LexicalCategoryEnum.ADJECTIVE)
    {
        addAdjectiveInfo(word, record.GetCatEntry().GetAdjEntry());
    }
    else if (kind == LexicalCategory.LexicalCategoryEnum.ADVERB)
    {
        addAdverbInfo(word, record.GetCatEntry().GetAdvEntry());
    }
    else if (kind == LexicalCategory.LexicalCategoryEnum.NOUN)
    {
        addNounInfo(word, record.GetCatEntry().GetNounEntry());
    }
    else if (kind == LexicalCategory.LexicalCategoryEnum.VERB)
    {
        addVerbInfo(word, record.GetCatEntry().GetVerbEntry());
    }

    Inflection? defaultInfl = word.getDefaultInflectionalVariant();

    // inflected forms
    foreach (InflVar inflVar in record.GetInflVarsAndAgreements().GetInflValues())
    {
        string featureName = getSimplenlgInflection(inflVar.GetInflection());
        if (featureName == null)
        {
            continue;
        }

        string surfaceForm = inflVar.GetVar();
        Inflection? inflType = Inflection.REGULAR.getInflCode(inflVar.GetType());

        // store every inflectional variant, except regular ones unless
        // standard inflections are explicitly kept
        if (inflType != null && (Inflection.REGULAR != inflType || keepStandardInflections))
        {
            word.addInflectionalVariant((Inflection)inflType, featureName, surfaceForm);
        }

        // when the word has no default variant, or this form belongs to the
        // default variant (subject to the same regular-form rule), also set
        // the form directly as a feature on the word
        bool setOnWord = defaultInfl == null
            || (defaultInfl.Equals(inflType)
                && (!Inflection.REGULAR.Equals(inflType) || keepStandardInflections));
        if (setOnWord)
        {
            word.setFeature(featureName, surfaceForm);
        }
    }

    addAcronymInfo(word, record);
    addSpellingVariants(word, record);

    return word;
}
/**
 * Create a simplenlg WordElement from a Word node in a lexicon XML file.
 *
 * Each child element of the node is read as a feature: the base, category
 * and id elements populate the corresponding word properties; an element
 * with empty text is treated as an inflection code if it is one, otherwise
 * as a boolean feature set to true; any other element is stored as a
 * string-valued feature. If no inflection code is present the word is
 * assumed to be regular.
 *
 * @param wordNode the XML node to convert
 * @return the word element, or null if the node is not a Word node
 */
private WordElement convertNodeToWord(XmlNode wordNode)
{
    // If this isn't a Word node, ignore it. Tag names are identifiers, so
    // compare ordinally like every other tag comparison in this method.
    if (!wordNode.Name.Equals(XML_WORD, StringComparison.OrdinalIgnoreCase))
    {
        return null;
    }

    WordElement word = new WordElement();
    IList<Inflection> inflections = new List<Inflection>();

    // copy features
    foreach (XmlNode featureNode in wordNode.ChildNodes)
    {
        if (featureNode.NodeType != XmlNodeType.Element)
        {
            continue;
        }

        string feature = featureNode.Name.Trim();
        string value = featureNode.InnerText?.Trim();

        if (feature.Equals(XML_BASE, StringComparison.OrdinalIgnoreCase))
        {
            word.BaseForm = value;
        }
        else if (feature.Equals(XML_CATEGORY, StringComparison.OrdinalIgnoreCase))
        {
            // Upper-case culture-invariantly: a culture-sensitive ToUpper()
            // can change characters (e.g. under Turkish casing rules) and
            // make the enum name fail to parse. An unparseable value leaves
            // lexcat at the enum's default, as before.
            Enum.TryParse(value.ToUpperInvariant(), out LexicalCategory.LexicalCategoryEnum lexcat);
            word.Category = new LexicalCategory(lexcat);
        }
        else if (feature.Equals(XML_ID, StringComparison.OrdinalIgnoreCase))
        {
            word.Id = value;
        }
        else if (string.IsNullOrEmpty(value))
        {
            // if this is an infl code, add it to inflections
            Inflection? infl = Inflection.REGULAR.getInflCode(feature);
            if (infl != null)
            {
                inflections.Add((Inflection)infl);
            }
            else
            {
                // otherwise assume it's a boolean feature
                word.setFeature(feature, true);
            }
        }
        else
        {
            word.setFeature(feature, value);
        }
    }

    // if no infl specified, assume regular
    if (inflections.Count == 0)
    {
        inflections.Add(Inflection.REGULAR);
    }

    // default inflection code is "reg" if we have it, else the first of the
    // infl codes available
    Inflection defaultInfl = inflections.Contains(Inflection.REGULAR)
        ? Inflection.REGULAR
        : inflections[0];
    word.setFeature(LexicalFeature.DEFAULT_INFL, defaultInfl);
    word.setDefaultInflectionalVariant(defaultInfl);

    foreach (Inflection infl in inflections)
    {
        word.addInflectionalVariant(infl);
    }

    return word;
}