コード例 #1
0
            //public GazetteerTerm(IEnumerable<string> tokens, IEnumerable<string> posConstraints, bool lemmatize, CaseMatchingType caseMatchingType, bool enabled, Gazetteer gazetteer)
            //{
            //    InitializeInstance(tokens, posConstraints, lemmatize, caseMatchingType, enabled, gazetteer);
            //}

            /// <summary>
            /// Creates a gazetteer term from its textual definition.
            /// </summary>
            /// <remarks>
            /// Every fragment matched by <c>mConstraintRegex</c> is handed to
            /// <c>ParseGazetteerSettings</c> (which may overwrite the supplied defaults) and is removed
            /// from the definition. The remaining text is tokenized with
            /// <c>mGazetteerMicroTokenRegex</c>. A token that splits on '/' into exactly two parts
            /// yields the first part as the token and the second as its POS constraint; any other
            /// token is kept whole with a null POS constraint.
            /// </remarks>
            /// <param name="termDef">Term definition text, optionally containing inline settings.</param>
            /// <param name="gazetteer">Gazetteer forwarded to <c>InitializeInstance</c>.</param>
            /// <param name="defaultCaseMatchingType">Case-matching type used unless the definition overrides it.</param>
            /// <param name="defaultLemmatizeFlag">Lemmatize flag used unless the definition overrides it.</param>
            /// <param name="defaultEnabledFlag">Enabled flag used unless the definition overrides it.</param>
            public GazetteerTerm(string termDef, Gazetteer gazetteer, CaseMatchingType defaultCaseMatchingType, bool defaultLemmatizeFlag, bool defaultEnabledFlag)
            {
                // Start from the caller's defaults; inline settings may overwrite them below.
                CaseMatchingType effectiveCaseMatching = defaultCaseMatchingType;
                bool effectiveLemmatize = defaultLemmatizeFlag;
                bool effectiveEnabled = defaultEnabledFlag;

                // Each settings fragment is parsed and then replaced by nothing.
                MatchEvaluator consumeSettings = delegate(Match settingsMatch)
                {
                    ParseGazetteerSettings(settingsMatch.Value, ref effectiveCaseMatching, ref effectiveLemmatize, ref effectiveEnabled);
                    return "";
                };
                string remainingDef = mConstraintRegex.Replace(termDef, consumeSettings);

                ArrayList<string> tokenList = new ArrayList<string>();
                ArrayList<string> posConstraintList = new ArrayList<string>();
                for (Match tokenMatch = mGazetteerMicroTokenRegex.Match(remainingDef); tokenMatch.Success; tokenMatch = tokenMatch.NextMatch())
                {
                    string rawToken = tokenMatch.Value;
                    string[] pieces = rawToken.Split('/');
                    if (pieces.Length == 2)
                    {
                        // "word/constraint" form
                        tokenList.Add(pieces[0]);
                        posConstraintList.Add(pieces[1]);
                    }
                    else
                    {
                        tokenList.Add(rawToken);
                        posConstraintList.Add(null);
                    }
                }
                InitializeInstance(tokenList, posConstraintList, effectiveLemmatize, effectiveCaseMatching, effectiveEnabled, gazetteer);
            }
コード例 #2
0
            /// <summary>
            /// Populates this term from parallel sequences of token strings and POS constraints.
            /// </summary>
            /// <remarks>
            /// The two sequences are consumed in lock-step; iteration stops as soon as either one is
            /// exhausted. Each token string is passed through <c>Normalize</c>, and tokens whose
            /// lower-cased form the gazetteer reports as a stop word are skipped. When
            /// <paramref name="lemmatize"/> is set, the lemma is taken from <c>mLemmatizer.GetStem</c>,
            /// falling back to the token string when no stem is produced. If at least one token is
            /// stored, <c>PrepareTokens</c> is invoked to apply the case-matching preparation.
            /// </remarks>
            /// <param name="tokens">Token strings of the term.</param>
            /// <param name="posConstraints">POS constraint for each token (entries may be null).</param>
            /// <param name="lemmatize">Whether a lemma is computed and stored for each token.</param>
            /// <param name="caseMatchingType">Case-matching type stored on the term and used for preparation.</param>
            /// <param name="enabled">Enabled flag stored on the term.</param>
            /// <param name="gazetteer">Gazetteer consulted for stop words.</param>
            private void InitializeInstance(IEnumerable <string> tokens, IEnumerable <string> posConstraints, bool lemmatize, CaseMatchingType caseMatchingType, bool enabled, Gazetteer gazetteer)
            {
                mCaseMatchingType = caseMatchingType;
                mEnabled          = enabled;

                // IEnumerator<T> is IDisposable; 'using' releases both enumerators even if a callee throws.
                using (IEnumerator <string> enumTokens = tokens.GetEnumerator())
                using (IEnumerator <string> enumPosConstraints = posConstraints.GetEnumerator())
                {
                    while (enumTokens.MoveNext() && enumPosConstraints.MoveNext())
                    {
                        string tokenStr      = Normalize(enumTokens.Current);
                        string posConstraint = enumPosConstraints.Current;
                        if (!gazetteer.IsStopWord(tokenStr.ToLower()))
                        {
                            string lemma = null;
                            if (lemmatize)
                            {
                                lemma = mLemmatizer.GetStem(tokenStr);
                                // Treat a null stem like an empty one: PrepareTokens dereferences
                                // mLemma whenever lemmatization is on, so it must not stay null here.
                                if (string.IsNullOrEmpty(lemma))
                                {
                                    lemma = tokenStr;
                                }
                            }
                            GazetteerToken token = new GazetteerToken(tokenStr, posConstraint, lemma);
                            mTokens.Add(token);
                        }
                    }
                }
                if (mTokens.Count > 0)
                {
                    PrepareTokens(caseMatchingType, lemmatize);
                }
            }
コード例 #3
0
 /// <summary>
 /// Applies a slash-separated list of <c>key=value</c> settings to the supplied values.
 /// </summary>
 /// <remarks>
 /// Recognised keys: <c>e</c> (enabled; any value other than <c>n</c> enables), <c>l</c>
 /// (lemmatize; only <c>y</c> enables), <c>ml</c> (minimum annotation length, an integer) and
 /// <c>c</c> (case-matching type: <c>ic</c>, <c>em</c>, <c>acs</c>, <c>acl</c>, <c>ics</c>,
 /// <c>icl</c>, <c>alc</c>, <c>auc</c>). Unknown keys, unknown case codes, segments that do not
 /// split into exactly one key and one value, and an <c>ml</c> value that is not a valid
 /// integer all leave the corresponding argument untouched.
 /// </remarks>
 /// <param name="settingsStr">Settings text; leading '/' characters are ignored.</param>
 /// <param name="caseMatchingType">Updated when a recognised <c>c</c> setting is present.</param>
 /// <param name="lemmatize">Updated when an <c>l</c> setting is present.</param>
 /// <param name="enabled">Updated when an <c>e</c> setting is present.</param>
 /// <param name="minLen">Updated when an <c>ml</c> setting with a valid integer is present.</param>
 private static void ParseGazetteerSettings(string settingsStr, ref CaseMatchingType caseMatchingType, ref bool lemmatize, ref bool enabled,
                                            ref int minLen)
 {
     string[] settings = settingsStr.TrimStart('/').Split('/');
     foreach (string setting in settings)
     {
         string[] keyVal = setting.Split('=');
         if (keyVal.Length != 2)
         {
             continue; // not a key=value pair
         }
         string value = keyVal[1];
         switch (keyVal[0])
         {
             case "e": // enabled
                 enabled = value != "n";
                 break;

             case "l": // lemmatize
                 lemmatize = value == "y";
                 break;

             case "ml": // minimum annotation length
                 // A malformed number is ignored like any other unrecognised setting instead of
                 // aborting the parse with an exception; parsing is culture-independent.
                 int parsedMinLen;
                 if (int.TryParse(value, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out parsedMinLen))
                 {
                     minLen = parsedMinLen;
                 }
                 break;

             case "c": // case-matching type
                 switch (value)
                 {
                     case "ic":  caseMatchingType = CaseMatchingType.IgnoreCase;    break;
                     case "em":  caseMatchingType = CaseMatchingType.ExactMatch;    break;
                     case "acs": caseMatchingType = CaseMatchingType.AllCapsStrict; break;
                     case "acl": caseMatchingType = CaseMatchingType.AllCapsLoose;  break;
                     case "ics": caseMatchingType = CaseMatchingType.InitCapStrict; break;
                     case "icl": caseMatchingType = CaseMatchingType.InitCapLoose;  break;
                     case "alc": caseMatchingType = CaseMatchingType.AllLowercase;  break;
                     case "auc": caseMatchingType = CaseMatchingType.AllUppercase;  break;
                 }
                 break;
         }
     }
 }
コード例 #4
0
            /// <summary>
            /// Rewrites the casing of the stored tokens (and optionally their lemmas) according to
            /// the case-matching type.
            /// </summary>
            /// <remarks>
            /// AllLowercase / IgnoreCase: everything is lower-cased.
            /// InitCapStrict / InitCapLoose: everything is lower-cased, then the first character of the
            /// first token is upper-cased.
            /// AllCapsStrict / AllCapsLoose: every token gets an upper-case first character and a
            /// lower-case remainder.
            /// AllUppercase: everything is upper-cased.
            /// Any other value leaves the tokens unchanged.
            /// </remarks>
            /// <param name="caseMatchingType">Selects the casing transformation.</param>
            /// <param name="processLemmas">When true, the same transformation is applied to each lemma.</param>
            private void PrepareTokens(CaseMatchingType caseMatchingType, bool processLemmas)
            {
                if (caseMatchingType == CaseMatchingType.AllLowercase || caseMatchingType == CaseMatchingType.IgnoreCase)
                {
                    for (int i = 0; i < mTokens.Count; i++)
                    {
                        GazetteerToken current = mTokens[i];
                        current.mTokenStr = current.mTokenStr.ToLower();
                        if (processLemmas)
                        {
                            current.mLemma = current.mLemma.ToLower();
                        }
                    }
                }
                else if (caseMatchingType == CaseMatchingType.InitCapStrict || caseMatchingType == CaseMatchingType.InitCapLoose)
                {
                    // First pass: lower-case every token.
                    for (int i = 0; i < mTokens.Count; i++)
                    {
                        GazetteerToken current = mTokens[i];
                        current.mTokenStr = current.mTokenStr.ToLower();
                        if (processLemmas)
                        {
                            current.mLemma = current.mLemma.ToLower();
                        }
                    }
                    // Then capitalize only the leading token.
                    GazetteerToken head = mTokens[0];
                    head.mTokenStr = char.ToUpper(head.mTokenStr[0]) + head.mTokenStr.Substring(1);
                    if (processLemmas)
                    {
                        head.mLemma = char.ToUpper(head.mLemma[0]) + head.mLemma.Substring(1);
                    }
                }
                else if (caseMatchingType == CaseMatchingType.AllCapsStrict || caseMatchingType == CaseMatchingType.AllCapsLoose)
                {
                    for (int i = 0; i < mTokens.Count; i++)
                    {
                        GazetteerToken current = mTokens[i];
                        string word = current.mTokenStr;
                        current.mTokenStr = char.ToUpper(word[0]) + word.Substring(1).ToLower();
                        if (processLemmas)
                        {
                            string lemma = current.mLemma;
                            current.mLemma = char.ToUpper(lemma[0]) + lemma.Substring(1).ToLower();
                        }
                    }
                }
                else if (caseMatchingType == CaseMatchingType.AllUppercase)
                {
                    for (int i = 0; i < mTokens.Count; i++)
                    {
                        GazetteerToken current = mTokens[i];
                        current.mTokenStr = current.mTokenStr.ToUpper();
                        if (processLemmas)
                        {
                            current.mLemma = current.mLemma.ToUpper();
                        }
                    }
                }
            }
コード例 #5
0
            /// <summary>
            /// Decides whether a document token satisfies a gazetteer token.
            /// </summary>
            /// <remarks>
            /// A non-null POS constraint must be a prefix of the document token's POS tag. After that,
            /// lemmas are compared when the gazetteer token carries a lemma, surface strings otherwise.
            /// IgnoreCase compares case-insensitively; ExactMatch, AllLowercase, AllUppercase,
            /// AllCapsStrict and InitCapStrict require exact equality; InitCapLoose compares
            /// case-insensitively and additionally requires an upper-case first character when
            /// <paramref name="firstToken"/> is true; AllCapsLoose compares case-insensitively and
            /// always requires an upper-case first character.
            /// </remarks>
            /// <param name="gazToken">Gazetteer token to match against.</param>
            /// <param name="docToken">Document token being tested.</param>
            /// <param name="caseMatchingType">Case-matching rule to apply.</param>
            /// <param name="firstToken">True when this is the first token of the term (used by InitCapLoose).</param>
            /// <returns>True if the document token matches; otherwise false.</returns>
            /// <exception cref="ArgumentValueException">The case-matching type is not one of the handled values.</exception>
            private bool Match(GazetteerToken gazToken, Token docToken, CaseMatchingType caseMatchingType, bool firstToken)
            {
                // check POS tag (tags are identifiers, so the prefix test is ordinal, not culture-sensitive)
                if (gazToken.mPosConstraint != null && !docToken.mPosTag.StartsWith(gazToken.mPosConstraint, StringComparison.Ordinal))
                {
                    return(false);
                }
                // check word or lemma
                bool   compareLemmas = gazToken.mLemma != null;
                string gazTokenStr   = compareLemmas ? gazToken.mLemma : gazToken.mTokenStr;
                string docTokenStr   = compareLemmas ? docToken.mLemma : docToken.mTokenStr;
                // An empty or null document string has no first character; treat it as "not upper-case"
                // instead of failing on docTokenStr[0].
                bool docStartsUpper = !string.IsNullOrEmpty(docTokenStr) && char.IsUpper(docTokenStr[0]);

                switch (caseMatchingType)
                {
                case CaseMatchingType.IgnoreCase:
                    return(string.Equals(gazTokenStr, docTokenStr, StringComparison.OrdinalIgnoreCase));

                case CaseMatchingType.ExactMatch:
                case CaseMatchingType.AllLowercase:
                case CaseMatchingType.AllUppercase:
                case CaseMatchingType.AllCapsStrict:
                case CaseMatchingType.InitCapStrict:
                    return(gazTokenStr == docTokenStr);

                case CaseMatchingType.InitCapLoose:
                    return((!firstToken || docStartsUpper) && string.Equals(gazTokenStr, docTokenStr, StringComparison.OrdinalIgnoreCase));

                case CaseMatchingType.AllCapsLoose:
                    return(docStartsUpper && string.Equals(gazTokenStr, docTokenStr, StringComparison.OrdinalIgnoreCase));

                default:
                    throw new ArgumentValueException("caseMatchingType");
                }
            }
コード例 #6
0
            /// <summary>
            /// Collects the settings that apply to this gazetteer from the RDF store.
            /// </summary>
            /// <remarks>
            /// Defaults are IgnoreCase, no lemmatization, enabled, minimum length 1. A chain of URIs is
            /// built from this gazetteer's own URI, the first type of the first subject that points to
            /// it via <c>P_IDENTIFIED_BY</c>, and that type's <c>P_SUBCLASS_OF</c> ancestors (first
            /// result at each step). The chain is then processed from the most distant ancestor down to
            /// the gazetteer itself, so settings found closer to the gazetteer override earlier ones.
            /// For each URI the first <c>P_SETTINGS</c> value is used, falling back to
            /// <c>P_COMMENT</c> for compatibility with OWL-DL.
            /// </remarks>
            /// <param name="rdfStore">Store that is queried for types, superclasses and settings.</param>
            /// <param name="caseMatchingType">Receives the resulting case-matching type.</param>
            /// <param name="lemmatize">Receives the resulting lemmatize flag.</param>
            /// <param name="enabled">Receives the resulting enabled flag.</param>
            /// <param name="minLen">Receives the resulting minimum annotation length.</param>
            private void ReadGazetteerSettings(MemoryStore rdfStore, out CaseMatchingType caseMatchingType, out bool lemmatize, out bool enabled,
                                               out int minLen)
            {
                caseMatchingType = CaseMatchingType.IgnoreCase;
                lemmatize        = false;
                enabled          = true;
                minLen           = 1;
                ArrayList <string> crumbs = new ArrayList <string>(new string[] { mUri });

                Entity[] objects = rdfStore.SelectSubjects(P_IDENTIFIED_BY, new Entity(mUri));
                if (objects.Length > 0)
                {
                    Resource[] objTypes = rdfStore.SelectObjects(objects[0].Uri, P_TYPE);
                    if (objTypes.Length > 0)
                    {
                        crumbs.Add(objTypes[0].Uri);
                        Resource[] superClass = rdfStore.SelectObjects((Entity)objTypes[0], P_SUBCLASS_OF);
                        while (superClass.Length > 0)
                        {
                            string superUri = superClass[0].Uri;
                            // A cyclic subclass chain would otherwise loop forever and grow the
                            // list without bound; stop as soon as a URI repeats.
                            if (superUri != null && crumbs.Contains(superUri))
                            {
                                break;
                            }
                            crumbs.Add(superUri);
                            superClass = rdfStore.SelectObjects((Entity)superClass[0], P_SUBCLASS_OF);
                        }
                    }
                }
                // Process ancestors first so that more specific settings are applied last.
                crumbs.Reverse();
                foreach (string uri in crumbs)
                {
                    Resource[] settings = rdfStore.SelectObjects(uri, P_SETTINGS);
                    if (settings.Length == 0)
                    {
                        // compatibility with OWL-DL
                        settings = rdfStore.SelectObjects(uri, P_COMMENT);
                    }
                    if (settings.Length > 0)
                    {
                        string settingsStr = ((Literal)settings[0]).Value;
                        ParseGazetteerSettings(settingsStr, ref caseMatchingType, ref lemmatize, ref enabled, ref minLen);
                    }
                }
            }
コード例 #7
0
 /// <summary>
 /// Tests a document token against a gazetteer token.
 /// </summary>
 /// <remarks>
 /// If the gazetteer token has a POS constraint, the document token's POS tag must start with
 /// it. The comparison then uses lemmas when the gazetteer token has one and surface strings
 /// otherwise. IgnoreCase is case-insensitive; ExactMatch, AllLowercase, AllUppercase,
 /// AllCapsStrict and InitCapStrict need exact equality; InitCapLoose is case-insensitive but,
 /// for the first token, also needs an upper-case first character; AllCapsLoose is
 /// case-insensitive and always needs an upper-case first character.
 /// </remarks>
 /// <param name="gazToken">Gazetteer token to match against.</param>
 /// <param name="docToken">Document token being tested.</param>
 /// <param name="caseMatchingType">Case-matching rule to apply.</param>
 /// <param name="firstToken">True when this is the first token of the term (used by InitCapLoose).</param>
 /// <returns>True if the document token matches; otherwise false.</returns>
 /// <exception cref="ArgumentValueException">The case-matching type is not one of the handled values.</exception>
 private bool Match(GazetteerToken gazToken, Token docToken, CaseMatchingType caseMatchingType, bool firstToken)
 {
     // check POS tag; ordinal prefix test because POS tags are identifiers, not natural-language text
     if (gazToken.mPosConstraint != null && !docToken.mPosTag.StartsWith(gazToken.mPosConstraint, StringComparison.Ordinal)) { return false; }
     // check word or lemma
     string gazTokenStr;
     string docTokenStr;
     if (gazToken.mLemma == null)
     {
         gazTokenStr = gazToken.mTokenStr;
         docTokenStr = docToken.mTokenStr;
     }
     else
     {
         gazTokenStr = gazToken.mLemma;
         docTokenStr = docToken.mLemma;
     }
     switch (caseMatchingType)
     {
         case CaseMatchingType.IgnoreCase:
             return string.Equals(gazTokenStr, docTokenStr, StringComparison.OrdinalIgnoreCase);
         case CaseMatchingType.ExactMatch:
         case CaseMatchingType.AllLowercase:
         case CaseMatchingType.AllUppercase:
         case CaseMatchingType.AllCapsStrict:
         case CaseMatchingType.InitCapStrict:
             return gazTokenStr == docTokenStr;
         case CaseMatchingType.InitCapLoose:
             if (firstToken)
             {
                 // Guard the [0] access: an empty or null document string cannot start with an upper-case letter.
                 if (string.IsNullOrEmpty(docTokenStr) || !char.IsUpper(docTokenStr[0])) { return false; }
             }
             return string.Equals(gazTokenStr, docTokenStr, StringComparison.OrdinalIgnoreCase);
         case CaseMatchingType.AllCapsLoose:
             // Same guard as above; this mode requires the capital on every token.
             if (string.IsNullOrEmpty(docTokenStr) || !char.IsUpper(docTokenStr[0])) { return false; }
             return string.Equals(gazTokenStr, docTokenStr, StringComparison.OrdinalIgnoreCase);
         default:
             throw new ArgumentValueException("caseMatchingType");
     }
 }
コード例 #8
0
 /// <summary>
 /// Normalizes the casing of the stored tokens, and of their lemmas when requested, for the
 /// given case-matching type.
 /// </summary>
 /// <remarks>
 /// AllLowercase / IgnoreCase lower-case everything. InitCapStrict / InitCapLoose lower-case
 /// everything and then upper-case the first character of the first token. AllCapsStrict /
 /// AllCapsLoose give every token an upper-case first character and a lower-case remainder.
 /// AllUppercase upper-cases everything. Other values leave the tokens as they are.
 /// </remarks>
 /// <param name="caseMatchingType">Selects the casing transformation.</param>
 /// <param name="processLemmas">When true, lemmas receive the same transformation as the token strings.</param>
 private void PrepareTokens(CaseMatchingType caseMatchingType, bool processLemmas)
 {
     switch (caseMatchingType)
     {
         case CaseMatchingType.AllLowercase:
         case CaseMatchingType.IgnoreCase:
         {
             foreach (GazetteerToken gazToken in mTokens)
             {
                 gazToken.mTokenStr = gazToken.mTokenStr.ToLower();
                 if (processLemmas)
                 {
                     gazToken.mLemma = gazToken.mLemma.ToLower();
                 }
             }
             break;
         }
         case CaseMatchingType.InitCapStrict:
         case CaseMatchingType.InitCapLoose:
         {
             // Lower-case the whole term first ...
             foreach (GazetteerToken gazToken in mTokens)
             {
                 gazToken.mTokenStr = gazToken.mTokenStr.ToLower();
                 if (processLemmas)
                 {
                     gazToken.mLemma = gazToken.mLemma.ToLower();
                 }
             }
             // ... then restore the capital on the leading token only.
             GazetteerToken leading = mTokens[0];
             string leadingStr = leading.mTokenStr;
             leading.mTokenStr = char.ToUpper(leadingStr[0]) + leadingStr.Substring(1);
             if (processLemmas)
             {
                 string leadingLemma = leading.mLemma;
                 leading.mLemma = char.ToUpper(leadingLemma[0]) + leadingLemma.Substring(1);
             }
             break;
         }
         case CaseMatchingType.AllCapsStrict:
         case CaseMatchingType.AllCapsLoose:
         {
             foreach (GazetteerToken gazToken in mTokens)
             {
                 string word = gazToken.mTokenStr;
                 gazToken.mTokenStr = char.ToUpper(word[0]) + word.Substring(1).ToLower();
                 if (processLemmas)
                 {
                     string lemma = gazToken.mLemma;
                     gazToken.mLemma = char.ToUpper(lemma[0]) + lemma.Substring(1).ToLower();
                 }
             }
             break;
         }
         case CaseMatchingType.AllUppercase:
         {
             foreach (GazetteerToken gazToken in mTokens)
             {
                 gazToken.mTokenStr = gazToken.mTokenStr.ToUpper();
                 if (processLemmas)
                 {
                     gazToken.mLemma = gazToken.mLemma.ToUpper();
                 }
             }
             break;
         }
     }
 }
コード例 #9
0
 /// <summary>
 /// Fills this term from parallel sequences of token strings and POS constraints.
 /// </summary>
 /// <remarks>
 /// Both sequences are walked together and the walk ends when either is exhausted. Token
 /// strings go through <c>Normalize</c>; a token whose lower-cased form is a stop word
 /// according to the gazetteer is skipped. With <paramref name="lemmatize"/> set, each kept
 /// token gets the stem from <c>mLemmatizer.GetStem</c> as its lemma, or the token string
 /// itself when no stem is produced. <c>PrepareTokens</c> runs afterwards if any token is stored.
 /// </remarks>
 /// <param name="tokens">Token strings of the term.</param>
 /// <param name="posConstraints">POS constraint for each token (entries may be null).</param>
 /// <param name="lemmatize">Whether a lemma is computed and stored for each token.</param>
 /// <param name="caseMatchingType">Case-matching type stored on the term and used for preparation.</param>
 /// <param name="enabled">Enabled flag stored on the term.</param>
 /// <param name="gazetteer">Gazetteer consulted for stop words.</param>
 private void InitializeInstance(IEnumerable<string> tokens, IEnumerable<string> posConstraints, bool lemmatize, CaseMatchingType caseMatchingType, bool enabled, Gazetteer gazetteer)
 {
     mCaseMatchingType = caseMatchingType;
     mEnabled = enabled;
     // Enumerators are disposable; the using blocks guarantee they are released on every exit path.
     using (IEnumerator<string> enumTokens = tokens.GetEnumerator())
     using (IEnumerator<string> enumPosConstraints = posConstraints.GetEnumerator())
     {
         while (enumTokens.MoveNext() && enumPosConstraints.MoveNext())
         {
             string tokenStr = Normalize(enumTokens.Current);
             string posConstraint = enumPosConstraints.Current;
             if (gazetteer.IsStopWord(tokenStr.ToLower())) { continue; }
             string lemma = null;
             if (lemmatize)
             {
                 lemma = mLemmatizer.GetStem(tokenStr);
                 // A null stem is handled like an empty one, because PrepareTokens dereferences
                 // mLemma whenever lemmatization is on.
                 if (string.IsNullOrEmpty(lemma)) { lemma = tokenStr; }
             }
             mTokens.Add(new GazetteerToken(tokenStr, posConstraint, lemma));
         }
     }
     if (mTokens.Count > 0)
     {
         PrepareTokens(caseMatchingType, lemmatize);
     }
 }
コード例 #10
0
 //public GazetteerTerm(IEnumerable<string> tokens, IEnumerable<string> posConstraints, bool lemmatize, CaseMatchingType caseMatchingType, bool enabled, Gazetteer gazetteer)
 //{
 //    InitializeInstance(tokens, posConstraints, lemmatize, caseMatchingType, enabled, gazetteer);
 //}
 /// <summary>
 /// Parses a term definition and initializes the term from it.
 /// </summary>
 /// <remarks>
 /// Fragments matched by <c>mConstraintRegex</c> are passed to <c>ParseGazetteerSettings</c>,
 /// which may override the default flags, and are stripped from the definition. The rest is
 /// tokenized with <c>mGazetteerMicroTokenRegex</c>; a token containing exactly one '/' is split
 /// into the token text and its POS constraint, all other tokens get a null constraint.
 /// </remarks>
 /// <param name="termDef">Term definition text, optionally containing inline settings.</param>
 /// <param name="gazetteer">Gazetteer forwarded to <c>InitializeInstance</c>.</param>
 /// <param name="defaultCaseMatchingType">Case-matching type used unless the definition overrides it.</param>
 /// <param name="defaultLemmatizeFlag">Lemmatize flag used unless the definition overrides it.</param>
 /// <param name="defaultEnabledFlag">Enabled flag used unless the definition overrides it.</param>
 public GazetteerTerm(string termDef, Gazetteer gazetteer, CaseMatchingType defaultCaseMatchingType, bool defaultLemmatizeFlag, bool defaultEnabledFlag)
 {
     // working copies of the defaults; the settings parser may change them
     CaseMatchingType caseMatching = defaultCaseMatchingType;
     bool useLemmas = defaultLemmatizeFlag;
     bool isEnabled = defaultEnabledFlag;

     // consume inline settings and drop them from the definition
     MatchEvaluator settingsHandler = new MatchEvaluator(delegate(Match fragment)
     {
         ParseGazetteerSettings(fragment.Value, ref caseMatching, ref useLemmas, ref isEnabled);
         return "";
     });
     string bareDefinition = mConstraintRegex.Replace(termDef, settingsHandler);

     ArrayList<string> words = new ArrayList<string>();
     ArrayList<string> wordPosConstraints = new ArrayList<string>();
     Match current = mGazetteerMicroTokenRegex.Match(bareDefinition);
     while (current.Success)
     {
         string[] parts = current.Value.Split('/');
         bool hasPosConstraint = parts.Length == 2;
         words.Add(hasPosConstraint ? parts[0] : current.Value);
         wordPosConstraints.Add(hasPosConstraint ? parts[1] : null);
         current = current.NextMatch();
     }
     InitializeInstance(words, wordPosConstraints, useLemmas, caseMatching, isEnabled, gazetteer);
 }
コード例 #11
0
 /// <summary>
 /// Collects the settings that apply to this gazetteer from the RDF store.
 /// </summary>
 /// <remarks>
 /// Defaults are IgnoreCase, no lemmatization, enabled. The URI chain consists of this
 /// gazetteer's URI, the first type of the first subject linked to it via
 /// <c>P_IDENTIFIED_BY</c>, and that type's <c>P_SUBCLASS_OF</c> ancestors (first result at each
 /// step). Settings are applied from the most distant ancestor down to the gazetteer itself,
 /// so values found closer to the gazetteer win. For each URI the first <c>P_SETTINGS</c> value
 /// is used, with <c>P_COMMENT</c> as a fallback for compatibility with OWL-DL.
 /// </remarks>
 /// <param name="rdfStore">Store that is queried for types, superclasses and settings.</param>
 /// <param name="caseMatchingType">Receives the resulting case-matching type.</param>
 /// <param name="lemmatize">Receives the resulting lemmatize flag.</param>
 /// <param name="enabled">Receives the resulting enabled flag.</param>
 private void ReadGazetteerSettings(MemoryStore rdfStore, out CaseMatchingType caseMatchingType, out bool lemmatize, out bool enabled)
 {
     caseMatchingType = CaseMatchingType.IgnoreCase;
     lemmatize = false;
     enabled = true;
     ArrayList<string> crumbs = new ArrayList<string>(new string[] { mUri });
     Entity[] objects = rdfStore.SelectSubjects(P_IDENTIFIED_BY, new Entity(mUri));
     if (objects.Length > 0)
     {
         Resource[] objTypes = rdfStore.SelectObjects(objects[0].Uri, P_TYPE);
         if (objTypes.Length > 0)
         {
             crumbs.Add(objTypes[0].Uri);
             Resource[] superClass = rdfStore.SelectObjects((Entity)objTypes[0], P_SUBCLASS_OF);
             while (superClass.Length > 0)
             {
                 string superUri = superClass[0].Uri;
                 // Stop on a repeated URI: a cyclic subclass chain would otherwise never terminate.
                 if (superUri != null && crumbs.Contains(superUri)) { break; }
                 crumbs.Add(superUri);
                 superClass = rdfStore.SelectObjects((Entity)superClass[0], P_SUBCLASS_OF);
             }
         }
     }
     // ancestors first, so that more specific settings are applied last
     crumbs.Reverse();
     foreach (string uri in crumbs)
     {
         Resource[] settings = rdfStore.SelectObjects(uri, P_SETTINGS);
         if (settings.Length == 0) { settings = rdfStore.SelectObjects(uri, P_COMMENT); } // for compatibility with OWL-DL
         if (settings.Length > 0)
         {
             string settingsStr = ((Literal)settings[0]).Value;
             ParseGazetteerSettings(settingsStr, ref caseMatchingType, ref lemmatize, ref enabled);
         }
     }
 }
コード例 #12
0
 /// <summary>
 /// Applies a slash-separated list of <c>key=value</c> settings to the supplied values.
 /// </summary>
 /// <remarks>
 /// Recognised keys: <c>e</c> (enabled; any value other than <c>n</c> enables), <c>l</c>
 /// (lemmatize; only <c>y</c> enables) and <c>c</c> (case-matching type: <c>ic</c>, <c>em</c>,
 /// <c>acs</c>, <c>acl</c>, <c>ics</c>, <c>icl</c>, <c>alc</c>, <c>auc</c>). Unknown keys,
 /// unknown case codes and segments that do not split into exactly one key and one value
 /// leave the arguments untouched.
 /// </remarks>
 /// <param name="settingsStr">Settings text; leading '/' characters are ignored.</param>
 /// <param name="caseMatchingType">Updated when a recognised <c>c</c> setting is present.</param>
 /// <param name="lemmatize">Updated when an <c>l</c> setting is present.</param>
 /// <param name="enabled">Updated when an <c>e</c> setting is present.</param>
 private static void ParseGazetteerSettings(string settingsStr, ref CaseMatchingType caseMatchingType, ref bool lemmatize, ref bool enabled)
 {
     foreach (string pair in settingsStr.TrimStart('/').Split('/'))
     {
         string[] parts = pair.Split('=');
         if (parts.Length != 2)
         {
             continue; // not a key=value pair
         }
         string key = parts[0];
         string val = parts[1];
         switch (key)
         {
             case "e": // enabled
                 enabled = val != "n";
                 break;
             case "l": // lemmatize
                 lemmatize = val == "y";
                 break;
             case "c": // case-matching type
                 switch (val)
                 {
                     case "ic":
                         caseMatchingType = CaseMatchingType.IgnoreCase;
                         break;
                     case "em":
                         caseMatchingType = CaseMatchingType.ExactMatch;
                         break;
                     case "acs":
                         caseMatchingType = CaseMatchingType.AllCapsStrict;
                         break;
                     case "acl":
                         caseMatchingType = CaseMatchingType.AllCapsLoose;
                         break;
                     case "ics":
                         caseMatchingType = CaseMatchingType.InitCapStrict;
                         break;
                     case "icl":
                         caseMatchingType = CaseMatchingType.InitCapLoose;
                         break;
                     case "alc":
                         caseMatchingType = CaseMatchingType.AllLowercase;
                         break;
                     case "auc":
                         caseMatchingType = CaseMatchingType.AllUppercase;
                         break;
                 }
                 break;
         }
     }
 }