//public GazetteerTerm(IEnumerable<string> tokens, IEnumerable<string> posConstraints, bool lemmatize, CaseMatchingType caseMatchingType, bool enabled, Gazetteer gazetteer)
//{
//    InitializeInstance(tokens, posConstraints, lemmatize, caseMatchingType, enabled, gazetteer);
//}

/// <summary>
/// Creates a term from its textual definition. Every match of mConstraintRegex in the
/// definition is handed to ParseGazetteerSettings and removed from the text; the remaining
/// text is split into tokens by mGazetteerMicroTokenRegex.
/// </summary>
/// <param name="termDef">Term definition: tokens (optionally written as token/POS) plus embedded settings.</param>
/// <param name="gazetteer">Gazetteer passed on to InitializeInstance.</param>
/// <param name="defaultCaseMatchingType">Case-matching type used unless the definition overrides it.</param>
/// <param name="defaultLemmatizeFlag">Lemmatize flag used unless the definition overrides it.</param>
/// <param name="defaultEnabledFlag">Enabled flag used unless the definition overrides it.</param>
public GazetteerTerm(string termDef, Gazetteer gazetteer, CaseMatchingType defaultCaseMatchingType, bool defaultLemmatizeFlag, bool defaultEnabledFlag)
{
    // Begin with the caller's defaults; embedded settings overwrite them below.
    CaseMatchingType effectiveCaseMatching = defaultCaseMatchingType;
    bool effectiveLemmatize = defaultLemmatizeFlag;
    bool effectiveEnabled = defaultEnabledFlag;

    // Each settings fragment is applied and then replaced by nothing.
    MatchEvaluator consumeSettings = delegate(Match settingsMatch)
    {
        ParseGazetteerSettings(settingsMatch.Value, ref effectiveCaseMatching, ref effectiveLemmatize, ref effectiveEnabled);
        return "";
    };
    string strippedDef = mConstraintRegex.Replace(termDef, consumeSettings);

    // Parallel lists: entry i of posConstraintList belongs to entry i of tokenList.
    ArrayList<string> tokenList = new ArrayList<string>();
    ArrayList<string> posConstraintList = new ArrayList<string>();
    for (Match tokenMatch = mGazetteerMicroTokenRegex.Match(strippedDef); tokenMatch.Success; tokenMatch = tokenMatch.NextMatch())
    {
        string rawToken = tokenMatch.Value;
        string[] pieces = rawToken.Split('/');
        // Only an exact "token/POS" pair carries a constraint; anything else is kept verbatim.
        bool hasPosConstraint = pieces.Length == 2;
        tokenList.Add(hasPosConstraint ? pieces[0] : rawToken);
        posConstraintList.Add(hasPosConstraint ? pieces[1] : null);
    }

    InitializeInstance(tokenList, posConstraintList, effectiveLemmatize, effectiveCaseMatching, effectiveEnabled, gazetteer);
}
/// <summary>
/// Stores the term settings and fills mTokens from two parallel sequences of tokens and
/// POS constraints. Pairs are consumed until either sequence runs out.
/// </summary>
/// <param name="tokens">Token strings; each one is passed through Normalize.</param>
/// <param name="posConstraints">POS constraint for the token at the same position (may be null).</param>
/// <param name="lemmatize">When true, a lemma is computed for every kept token.</param>
/// <param name="caseMatchingType">Case-matching type stored on the term and used to prepare the tokens.</param>
/// <param name="enabled">Enabled flag stored on the term.</param>
/// <param name="gazetteer">Gazetteer whose stop-word list filters the tokens.</param>
private void InitializeInstance(IEnumerable<string> tokens, IEnumerable<string> posConstraints, bool lemmatize, CaseMatchingType caseMatchingType, bool enabled, Gazetteer gazetteer)
{
    mCaseMatchingType = caseMatchingType;
    mEnabled = enabled;
    // IEnumerator<T> is IDisposable; 'using' releases both enumerators even if a callee throws.
    using (IEnumerator<string> enumTokens = tokens.GetEnumerator())
    using (IEnumerator<string> enumPosConstraints = posConstraints.GetEnumerator())
    {
        while (enumTokens.MoveNext() && enumPosConstraints.MoveNext())
        {
            string tokenStr = Normalize(enumTokens.Current);
            string posConstraint = enumPosConstraints.Current;
            // Stop words (looked up in lower case) never become part of the term.
            if (gazetteer.IsStopWord(tokenStr.ToLower()))
            {
                continue;
            }
            string lemma = null;
            if (lemmatize)
            {
                lemma = mLemmatizer.GetStem(tokenStr);
                // Fall back to the surface form when no stem is produced. A null stem must be
                // caught too, otherwise PrepareTokens would dereference a null lemma.
                if (string.IsNullOrEmpty(lemma))
                {
                    lemma = tokenStr;
                }
            }
            mTokens.Add(new GazetteerToken(tokenStr, posConstraint, lemma));
        }
    }
    if (mTokens.Count > 0)
    {
        PrepareTokens(caseMatchingType, lemmatize);
    }
}
/// <summary>
/// Parses a settings string of the form "/key=value/key=value..." and overwrites the
/// referenced settings for every recognized key. Fragments that are not a single
/// key=value pair, unknown keys and unrecognized values are ignored, leaving the
/// corresponding setting untouched.
/// </summary>
/// <param name="settingsStr">Settings string; leading '/' characters are trimmed.</param>
/// <param name="caseMatchingType">Updated by key "c" (ic, em, acs, acl, ics, icl, alc, auc).</param>
/// <param name="lemmatize">Updated by key "l"; true only for value "y".</param>
/// <param name="enabled">Updated by key "e"; false only for value "n".</param>
/// <param name="minLen">Updated by key "ml" when the value is a valid integer.</param>
private static void ParseGazetteerSettings(string settingsStr, ref CaseMatchingType caseMatchingType, ref bool lemmatize, ref bool enabled, ref int minLen)
{
    string[] settings = settingsStr.TrimStart('/').Split('/');
    foreach (string setting in settings)
    {
        string[] keyVal = setting.Split('=');
        if (keyVal.Length != 2)
        {
            continue;
        }
        string key = keyVal[0];
        string val = keyVal[1];
        if (key == "e") // enabled
        {
            enabled = val != "n";
        }
        else if (key == "l") // lemmatize
        {
            lemmatize = val == "y";
        }
        else if (key == "ml") // minimum annotation length
        {
            // The settings text may be free-form, so a malformed number is skipped like any
            // other unrecognized value instead of aborting the whole parse with an exception.
            // Invariant culture keeps the result independent of the machine's locale.
            int parsedMinLen;
            if (int.TryParse(val, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out parsedMinLen))
            {
                minLen = parsedMinLen;
            }
        }
        else if (key == "c") // case-matching type
        {
            if (val == "ic") { caseMatchingType = CaseMatchingType.IgnoreCase; }
            else if (val == "em") { caseMatchingType = CaseMatchingType.ExactMatch; }
            else if (val == "acs") { caseMatchingType = CaseMatchingType.AllCapsStrict; }
            else if (val == "acl") { caseMatchingType = CaseMatchingType.AllCapsLoose; }
            else if (val == "ics") { caseMatchingType = CaseMatchingType.InitCapStrict; }
            else if (val == "icl") { caseMatchingType = CaseMatchingType.InitCapLoose; }
            else if (val == "alc") { caseMatchingType = CaseMatchingType.AllLowercase; }
            else if (val == "auc") { caseMatchingType = CaseMatchingType.AllUppercase; }
        }
    }
}
/// <summary>
/// Rewrites the casing of the stored tokens (and of their lemmas when
/// <paramref name="processLemmas"/> is true) into the form that the given
/// case-matching type is compared against:
/// AllLowercase/IgnoreCase lower-case everything; InitCapStrict/InitCapLoose lower-case
/// everything and then upper-case the first character of the first token;
/// AllCapsStrict/AllCapsLoose upper-case the first character of every token and lower-case
/// the rest; AllUppercase upper-cases everything. Other types leave the tokens unchanged.
/// </summary>
/// <param name="caseMatchingType">Case-matching type that selects the rewrite.</param>
/// <param name="processLemmas">Whether the lemmas are rewritten along with the token strings.</param>
private void PrepareTokens(CaseMatchingType caseMatchingType, bool processLemmas)
{
    // Nothing to rewrite; also protects the mTokens[0] access below.
    if (mTokens.Count == 0)
    {
        return;
    }
    switch (caseMatchingType)
    {
        case CaseMatchingType.AllLowercase:
        case CaseMatchingType.IgnoreCase:
            foreach (GazetteerToken token in mTokens)
            {
                token.mTokenStr = token.mTokenStr.ToLower();
                if (processLemmas) { token.mLemma = token.mLemma.ToLower(); }
            }
            break;
        case CaseMatchingType.InitCapStrict:
        case CaseMatchingType.InitCapLoose:
            foreach (GazetteerToken token in mTokens)
            {
                token.mTokenStr = token.mTokenStr.ToLower();
                if (processLemmas) { token.mLemma = token.mLemma.ToLower(); }
            }
            // Only the very first token of the term carries the initial capital.
            mTokens[0].mTokenStr = CapitalizeFirstChar(mTokens[0].mTokenStr, false);
            if (processLemmas) { mTokens[0].mLemma = CapitalizeFirstChar(mTokens[0].mLemma, false); }
            break;
        case CaseMatchingType.AllCapsStrict:
        case CaseMatchingType.AllCapsLoose:
            foreach (GazetteerToken token in mTokens)
            {
                token.mTokenStr = CapitalizeFirstChar(token.mTokenStr, true);
                if (processLemmas) { token.mLemma = CapitalizeFirstChar(token.mLemma, true); }
            }
            break;
        case CaseMatchingType.AllUppercase:
            foreach (GazetteerToken token in mTokens)
            {
                token.mTokenStr = token.mTokenStr.ToUpper();
                if (processLemmas) { token.mLemma = token.mLemma.ToUpper(); }
            }
            break;
    }
}

// Upper-cases the first character of str and optionally lower-cases the remainder.
// Null or empty input is returned unchanged instead of failing on the [0] index.
private static string CapitalizeFirstChar(string str, bool lowerRest)
{
    if (string.IsNullOrEmpty(str))
    {
        return str;
    }
    string rest = str.Substring(1);
    if (lowerRest)
    {
        rest = rest.ToLower();
    }
    return char.ToUpper(str[0]) + rest;
}
/// <summary>
/// Tests whether a document token satisfies a gazetteer token: first the optional POS
/// constraint (the document POS tag must start with it), then the word itself, or the
/// lemma when the gazetteer token has one, under the given case-matching type.
/// </summary>
/// <param name="gazToken">Gazetteer token (pattern side).</param>
/// <param name="docToken">Document token (text side).</param>
/// <param name="caseMatchingType">How the two strings are compared.</param>
/// <param name="firstToken">True when this is the first token of the term; only used by InitCapLoose.</param>
/// <returns>True when the document token matches.</returns>
/// <exception cref="ArgumentValueException">The case-matching type is not one of the handled values.</exception>
private bool Match(GazetteerToken gazToken, Token docToken, CaseMatchingType caseMatchingType, bool firstToken)
{
    // check POS tag; a token without a tag cannot satisfy a constraint
    if (gazToken.mPosConstraint != null
        && (docToken.mPosTag == null || !docToken.mPosTag.StartsWith(gazToken.mPosConstraint, StringComparison.Ordinal)))
    {
        return false;
    }
    // check word or lemma
    string gazTokenStr;
    string docTokenStr;
    if (gazToken.mLemma == null)
    {
        gazTokenStr = gazToken.mTokenStr;
        docTokenStr = docToken.mTokenStr;
    }
    else
    {
        gazTokenStr = gazToken.mLemma;
        docTokenStr = docToken.mLemma;
    }
    switch (caseMatchingType)
    {
        case CaseMatchingType.IgnoreCase:
            return string.Equals(gazTokenStr, docTokenStr, StringComparison.OrdinalIgnoreCase);
        case CaseMatchingType.ExactMatch:
        case CaseMatchingType.AllLowercase:
        case CaseMatchingType.AllUppercase:
        case CaseMatchingType.AllCapsStrict:
        case CaseMatchingType.InitCapStrict:
            return gazTokenStr == docTokenStr;
        case CaseMatchingType.InitCapLoose:
            // Any casing is accepted except that the first token of the term must start
            // with an upper-case letter. An empty or null document string yields false
            // rather than an exception from the [0] index.
            return string.Equals(gazTokenStr, docTokenStr, StringComparison.OrdinalIgnoreCase)
                && (!firstToken || (!string.IsNullOrEmpty(docTokenStr) && char.IsUpper(docTokenStr[0])));
        case CaseMatchingType.AllCapsLoose:
            // Every token must start with an upper-case letter; the rest is case-insensitive.
            return string.Equals(gazTokenStr, docTokenStr, StringComparison.OrdinalIgnoreCase)
                && !string.IsNullOrEmpty(docTokenStr)
                && char.IsUpper(docTokenStr[0]);
        default:
            throw new ArgumentValueException("caseMatchingType");
    }
}
/// <summary>
/// Determines the settings for this object (mUri) from the RDF store. Starting from the
/// defaults (IgnoreCase, no lemmatization, enabled, minimum length 1), settings are applied
/// from the most general superclass down to mUri itself, so the more specific entries
/// override the more general ones.
/// </summary>
/// <param name="rdfStore">Store that is queried for types, superclasses and settings.</param>
/// <param name="caseMatchingType">Receives the resulting case-matching type.</param>
/// <param name="lemmatize">Receives the resulting lemmatize flag.</param>
/// <param name="enabled">Receives the resulting enabled flag.</param>
/// <param name="minLen">Receives the resulting minimum annotation length.</param>
private void ReadGazetteerSettings(MemoryStore rdfStore, out CaseMatchingType caseMatchingType, out bool lemmatize, out bool enabled, out int minLen)
{
    caseMatchingType = CaseMatchingType.IgnoreCase;
    lemmatize = false;
    enabled = true;
    minLen = 1;
    // crumbs: mUri, then the type of the first subject identified by mUri, then its superclass chain
    ArrayList<string> crumbs = new ArrayList<string>(new string[] { mUri });
    Entity[] objects = rdfStore.SelectSubjects(P_IDENTIFIED_BY, new Entity(mUri));
    if (objects.Length > 0)
    {
        Resource[] objTypes = rdfStore.SelectObjects(objects[0].Uri, P_TYPE);
        if (objTypes.Length > 0)
        {
            crumbs.Add(objTypes[0].Uri);
            Resource[] superClass = rdfStore.SelectObjects((Entity)objTypes[0], P_SUBCLASS_OF);
            // Stop when the next superclass was already visited: a reflexive or cyclic
            // subclass relation would otherwise keep this loop running forever.
            while (superClass.Length > 0 && !crumbs.Contains(superClass[0].Uri))
            {
                crumbs.Add(superClass[0].Uri);
                superClass = rdfStore.SelectObjects((Entity)superClass[0], P_SUBCLASS_OF);
            }
        }
    }
    // most general first, so that later (more specific) settings win
    crumbs.Reverse();
    foreach (string uri in crumbs)
    {
        Resource[] settings = rdfStore.SelectObjects(uri, P_SETTINGS);
        if (settings.Length == 0)
        {
            // compatibility with OWL-DL: fall back to the comment property
            settings = rdfStore.SelectObjects(uri, P_COMMENT);
        }
        if (settings.Length > 0)
        {
            string settingsStr = ((Literal)settings[0]).Value;
            ParseGazetteerSettings(settingsStr, ref caseMatchingType, ref lemmatize, ref enabled, ref minLen);
        }
    }
}
/// <summary>
/// Decides whether <paramref name="docToken"/> matches <paramref name="gazToken"/>.
/// The POS constraint, if present, must be a prefix of the document token's POS tag.
/// The comparison then uses the lemmas when the gazetteer token has a lemma and the
/// surface strings otherwise, following the rules of the given case-matching type.
/// </summary>
/// <param name="gazToken">Token of the gazetteer term.</param>
/// <param name="docToken">Token taken from the document.</param>
/// <param name="caseMatchingType">Comparison rule to apply.</param>
/// <param name="firstToken">Whether this is the first token of the term (relevant to InitCapLoose only).</param>
/// <returns>True on a match, false otherwise.</returns>
/// <exception cref="ArgumentValueException">Thrown for a case-matching type without a rule.</exception>
private bool Match(GazetteerToken gazToken, Token docToken, CaseMatchingType caseMatchingType, bool firstToken)
{
    // check POS tag; an untagged document token cannot satisfy a constraint
    if (gazToken.mPosConstraint != null)
    {
        if (docToken.mPosTag == null || !docToken.mPosTag.StartsWith(gazToken.mPosConstraint, StringComparison.Ordinal))
        {
            return false;
        }
    }
    // check word or lemma
    bool useLemma = gazToken.mLemma != null;
    string gazTokenStr = useLemma ? gazToken.mLemma : gazToken.mTokenStr;
    string docTokenStr = useLemma ? docToken.mLemma : docToken.mTokenStr;
    switch (caseMatchingType)
    {
        case CaseMatchingType.IgnoreCase:
            return string.Equals(gazTokenStr, docTokenStr, StringComparison.OrdinalIgnoreCase);
        case CaseMatchingType.ExactMatch:
        case CaseMatchingType.AllLowercase:
        case CaseMatchingType.AllUppercase:
        case CaseMatchingType.AllCapsStrict:
        case CaseMatchingType.InitCapStrict:
            return gazTokenStr == docTokenStr;
        case CaseMatchingType.InitCapLoose:
        case CaseMatchingType.AllCapsLoose:
            // Text must agree regardless of case. Checking this first means an empty or
            // null document string is rejected here instead of failing on the [0] index.
            if (!string.Equals(gazTokenStr, docTokenStr, StringComparison.OrdinalIgnoreCase))
            {
                return false;
            }
            // InitCapLoose demands the capital only on the first token of the term,
            // AllCapsLoose demands it on every token.
            if (caseMatchingType == CaseMatchingType.InitCapLoose && !firstToken)
            {
                return true;
            }
            return !string.IsNullOrEmpty(docTokenStr) && char.IsUpper(docTokenStr[0]);
        default:
            throw new ArgumentValueException("caseMatchingType");
    }
}
/// <summary>
/// Records the case-matching type and enabled flag, then converts the token and
/// POS-constraint sequences, read in lockstep until either is exhausted, into
/// GazetteerToken entries of mTokens. If any token was kept, the tokens are finally
/// passed through PrepareTokens.
/// </summary>
/// <param name="tokens">Token strings; each is normalized before use.</param>
/// <param name="posConstraints">POS constraints aligned with <paramref name="tokens"/>; entries may be null.</param>
/// <param name="lemmatize">Whether to compute a lemma for each kept token.</param>
/// <param name="caseMatchingType">Case-matching type of the term.</param>
/// <param name="enabled">Enabled flag of the term.</param>
/// <param name="gazetteer">Provides the stop-word check.</param>
private void InitializeInstance(IEnumerable<string> tokens, IEnumerable<string> posConstraints, bool lemmatize, CaseMatchingType caseMatchingType, bool enabled, Gazetteer gazetteer)
{
    mCaseMatchingType = caseMatchingType;
    mEnabled = enabled;
    // Both enumerators are disposable; the using blocks release them on every exit path.
    using (IEnumerator<string> enumTokens = tokens.GetEnumerator())
    {
        using (IEnumerator<string> enumPosConstraints = posConstraints.GetEnumerator())
        {
            while (enumTokens.MoveNext() && enumPosConstraints.MoveNext())
            {
                string tokenStr = Normalize(enumTokens.Current);
                string posConstraint = enumPosConstraints.Current;
                if (!gazetteer.IsStopWord(tokenStr.ToLower()))
                {
                    string lemma = null;
                    if (lemmatize)
                    {
                        lemma = mLemmatizer.GetStem(tokenStr);
                        // No usable stem (empty or null): use the token itself so that a
                        // lemmatized term never ends up with a null lemma.
                        if (string.IsNullOrEmpty(lemma))
                        {
                            lemma = tokenStr;
                        }
                    }
                    GazetteerToken token = new GazetteerToken(tokenStr, posConstraint, lemma);
                    mTokens.Add(token);
                }
            }
        }
    }
    if (mTokens.Count > 0)
    {
        PrepareTokens(caseMatchingType, lemmatize);
    }
}
//public GazetteerTerm(IEnumerable<string> tokens, IEnumerable<string> posConstraints, bool lemmatize, CaseMatchingType caseMatchingType, bool enabled, Gazetteer gazetteer)
//{
//    InitializeInstance(tokens, posConstraints, lemmatize, caseMatchingType, enabled, gazetteer);
//}

/// <summary>
/// Parses a term definition into tokens with optional POS constraints and initializes the
/// term. Settings found in the definition (matches of mConstraintRegex) take precedence
/// over the supplied defaults and are stripped from the text before tokenization.
/// </summary>
/// <param name="termDef">Definition text of the term.</param>
/// <param name="gazetteer">Gazetteer forwarded to InitializeInstance.</param>
/// <param name="defaultCaseMatchingType">Fallback case-matching type.</param>
/// <param name="defaultLemmatizeFlag">Fallback lemmatize flag.</param>
/// <param name="defaultEnabledFlag">Fallback enabled flag.</param>
public GazetteerTerm(string termDef, Gazetteer gazetteer, CaseMatchingType defaultCaseMatchingType, bool defaultLemmatizeFlag, bool defaultEnabledFlag)
{
    // working copies of the defaults; the settings parser updates them in place
    CaseMatchingType caseMatching = defaultCaseMatchingType;
    bool useLemmas = defaultLemmatizeFlag;
    bool isEnabled = defaultEnabledFlag;

    // apply and remove all settings fragments
    string tokenText = mConstraintRegex.Replace(termDef, delegate(Match fragment)
    {
        ParseGazetteerSettings(fragment.Value, ref caseMatching, ref useLemmas, ref isEnabled);
        return "";
    });

    // collect tokens and their POS constraints side by side
    ArrayList<string> words = new ArrayList<string>();
    ArrayList<string> constraints = new ArrayList<string>();
    Match current = mGazetteerMicroTokenRegex.Match(tokenText);
    while (current.Success)
    {
        string word = current.Value;
        string constraint = null;
        string[] split = word.Split('/');
        if (split.Length == 2)
        {
            // "word/POS" form
            word = split[0];
            constraint = split[1];
        }
        words.Add(word);
        constraints.Add(constraint);
        current = current.NextMatch();
    }

    InitializeInstance(words, constraints, useLemmas, caseMatching, isEnabled, gazetteer);
}
/// <summary>
/// Resolves the settings for this object (mUri) from the RDF store. The defaults are
/// IgnoreCase, no lemmatization and enabled. Settings attached to the superclass chain,
/// to the type and finally to mUri itself are applied in that order, so the most
/// specific entry has the last word.
/// </summary>
/// <param name="rdfStore">Store queried for types, superclasses and settings.</param>
/// <param name="caseMatchingType">Receives the resulting case-matching type.</param>
/// <param name="lemmatize">Receives the resulting lemmatize flag.</param>
/// <param name="enabled">Receives the resulting enabled flag.</param>
private void ReadGazetteerSettings(MemoryStore rdfStore, out CaseMatchingType caseMatchingType, out bool lemmatize, out bool enabled)
{
    caseMatchingType = CaseMatchingType.IgnoreCase;
    lemmatize = false;
    enabled = true;
    // crumbs: mUri, then the type of the first subject identified by mUri, then its superclass chain
    ArrayList<string> crumbs = new ArrayList<string>(new string[] { mUri });
    Entity[] objects = rdfStore.SelectSubjects(P_IDENTIFIED_BY, new Entity(mUri));
    if (objects.Length > 0)
    {
        Resource[] objTypes = rdfStore.SelectObjects(objects[0].Uri, P_TYPE);
        if (objTypes.Length > 0)
        {
            crumbs.Add(objTypes[0].Uri);
            Resource[] superClass = rdfStore.SelectObjects((Entity)objTypes[0], P_SUBCLASS_OF);
            // A class that is (directly or indirectly) its own superclass would make this
            // walk endless; stop as soon as an already collected URI comes up again.
            while (superClass.Length > 0 && !crumbs.Contains(superClass[0].Uri))
            {
                crumbs.Add(superClass[0].Uri);
                superClass = rdfStore.SelectObjects((Entity)superClass[0], P_SUBCLASS_OF);
            }
        }
    }
    // general-to-specific order, so that specific settings override general ones
    crumbs.Reverse();
    foreach (string uri in crumbs)
    {
        Resource[] settings = rdfStore.SelectObjects(uri, P_SETTINGS);
        if (settings.Length == 0)
        {
            // for compatibility with OWL-DL: fall back to the comment property
            settings = rdfStore.SelectObjects(uri, P_COMMENT);
        }
        if (settings.Length > 0)
        {
            string settingsStr = ((Literal)settings[0]).Value;
            ParseGazetteerSettings(settingsStr, ref caseMatchingType, ref lemmatize, ref enabled);
        }
    }
}
/// <summary>
/// Reads "/key=value/key=value..." settings and updates the referenced values for each
/// recognized key. Fragments that do not split into exactly one key and one value,
/// unknown keys and unknown case-matching codes leave the current values as they are.
/// </summary>
/// <param name="settingsStr">Settings string; leading '/' characters are trimmed.</param>
/// <param name="caseMatchingType">Set by key "c" (ic, em, acs, acl, ics, icl, alc, auc).</param>
/// <param name="lemmatize">Set by key "l"; true only for value "y".</param>
/// <param name="enabled">Set by key "e"; false only for value "n".</param>
private static void ParseGazetteerSettings(string settingsStr, ref CaseMatchingType caseMatchingType, ref bool lemmatize, ref bool enabled)
{
    foreach (string entry in settingsStr.TrimStart('/').Split('/'))
    {
        string[] pair = entry.Split('=');
        if (pair.Length != 2)
        {
            continue;
        }
        string settingValue = pair[1];
        switch (pair[0])
        {
            case "e": // enabled
                enabled = settingValue != "n";
                break;
            case "l": // lemmatize
                lemmatize = settingValue == "y";
                break;
            case "c": // case-matching type
                switch (settingValue)
                {
                    case "ic": caseMatchingType = CaseMatchingType.IgnoreCase; break;
                    case "em": caseMatchingType = CaseMatchingType.ExactMatch; break;
                    case "acs": caseMatchingType = CaseMatchingType.AllCapsStrict; break;
                    case "acl": caseMatchingType = CaseMatchingType.AllCapsLoose; break;
                    case "ics": caseMatchingType = CaseMatchingType.InitCapStrict; break;
                    case "icl": caseMatchingType = CaseMatchingType.InitCapLoose; break;
                    case "alc": caseMatchingType = CaseMatchingType.AllLowercase; break;
                    case "auc": caseMatchingType = CaseMatchingType.AllUppercase; break;
                }
                break;
        }
    }
}