Example #1
        /*
         * /// <summary>
         * /// Basic processing based on the language: whether the word in question is known or not
         * /// </summary>
         * /// <param name="token"></param>
         * /// <param name="language"></param>
         * private static void deployTokenLanguageBasic(IContentToken token, basicLanguage language)
         * {
         *
         *  switch (token.genericType)
         *  {
         *      case nlpTokenGenericType.unknownWord:
         *          if (language.isKnownWord(token.content))
         *          {
         *              token.genericType = nlpTokenGenericType.knownWord;
         *          }
         *          break;
         *      case nlpTokenGenericType.number:
         *          break;
         *
         *  }
         *
         * }
         */

        /*
         * /// <summary>
         * /// PHASE 3: Additional token processing based on language settings -- not implemented yet!!!
         * /// </summary>
         * /// <param name="token"></param>
         * /// <param name="language"></param>
         * private static void deployTokenLanguage(IContentToken token, basicLanguage language)
         * {
         *
         *  switch (token.genericType)
         *  {
         *      case nlpTokenGenericType.unknownWord:
         *
         *          //if (language.testBoolean(token.content, basicLanguageCheck.spellCheck))
         *          //{
         *          //    token.genericType = nlpTokenGenericType.knownWord;
         *          //}
         *
         *          // token.wordVariations = languageTools.test<List<string>>(language, token.content, languageModelOperation.getVariations) as List<string>;
         *          // List<string> stems = languageTools.test<List<string>>(language, token.content, languageModelOperation.getStems) as List<string>;
         *          //token.wordRoot = imbStringOperations.longestCommonSubstring(token.wordVariations);
         *
         *          //token.wordRoot = stems[0];
         *          break;
         *      case nlpTokenGenericType.number:
         *
         *
         *          break;
         *
         *  }
         *
         * }
         */
        /// <summary>
        /// PHASE 2: adjusts the letter case, checks the language, checks whether it might be an acronym - works only if syllables were detected
        /// </summary>
        /// <param name="token"></param>
        /// <param name="language"></param>
        /// <returns></returns>
        private static nlpTokenGenericType findGenericTypeSecond(IContentToken token, basicLanguage language)
        {
            nlpTokenGenericType output = token.genericType;
            object testOut;

            /*
             *
             * if (token.tokenBaseType == nlpTokenBaseType.word)
             * {
             *  token.letterCase = nlpTextCase.unknown;
             *  if (tokenization.wordWithCapitalStart.IsMatch(token.content)) token.letterCase = nlpTextCase.firstUpperRestLower;
             *  if (token.letterCase == nlpTextCase.unknown) if (token.content.ToLower() == token.content) token.letterCase = nlpTextCase.lowerCase;
             *  if (token.letterCase == nlpTextCase.unknown) if (token.content.ToUpper() == token.content) token.letterCase = nlpTextCase.upperCase;
             *  if (token.letterCase == nlpTextCase.unknown) token.letterCase = nlpTextCase.mixedCase;
             * }
             */

            if (token.flags.HasFlag(contentTokenFlag.languageWord))
            {
                if (language.testBoolean(token.content, basicLanguageCheck.spellCheck))
                {
                    token.flags = token.flags.Add(contentTokenFlag.languageKnownWord);

                    output = nlpTokenGenericType.knownWord;
                }
                else
                {
                    if (token.flags.getEnumListFromFlags().ContainsOneOrMore(contentTokenFlag.acronim, contentTokenFlag.acronimDiscovered, contentTokenFlag.acronimKnown))
                    {
                        output = nlpTokenGenericType.wordAbrevation;
                    }
                    else
                    {
                        if (token.flags.HasFlag(contentTokenFlag.caseAllUpper))
                        {
                            contentToken pt = token.parent as contentToken;
                            if (pt != null)
                            {
                                if (pt.flags.HasFlag(contentTokenFlag.subsentence_title))
                                {
                                    token.flags = token.flags.Add(contentTokenFlag.title);
                                }
                                else if (pt.flags.HasFlag(contentTokenFlag.subsentence_information))
                                {
                                    token.flags = token.flags.Add(contentTokenFlag.namedEntity);
                                }
                            }
                            else
                            {
                                token.flags = token.flags.Add(contentTokenFlag.titleOneWord);
                            }
                        }
                        else if (token.flags.HasFlag(contentTokenFlag.caseFirstUpper))
                        {
                            contentToken pt = token.parent as contentToken;
                            if (pt != null)
                            {
                                if (pt.flags.HasFlag(contentTokenFlag.subsentence_title))
                                {
                                    token.flags = token.flags.Add(contentTokenFlag.title);
                                }
                                else if (pt.flags.HasFlag(contentTokenFlag.subsentence_information))
                                {
                                    token.flags = token.flags.Add(contentTokenFlag.namedEntity);
                                }
                                else
                                {
                                    if (!token.isFirst)
                                    {
                                        token.flags = token.flags.Add(contentTokenFlag.namedEntity);
                                    }
                                }
                            }
                        }
                    }
                }
            }
            token.genericType = output;

            return(output);
        }
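
        // For orientation only: a minimal, hypothetical sketch (not part of the original sources)
        // of how the two detection phases could be chained for a single token. It assumes that
        // findGenericTypeBasic, findGenericTypeSecond and genericToBaseType are visible from the
        // same class.
        private static nlpTokenGenericType classifyToken(IContentToken token, basicLanguage language)
        {
            // PHASE 1: regex-based detection of the generic type (word, number, symbols, ...)
            token.genericType = findGenericTypeBasic(token);

            // PHASE 2: refine word tokens using spell check, letter case and acronym flags
            if (genericToBaseType(token.genericType) == nlpTokenBaseType.word)
            {
                findGenericTypeSecond(token, language); // updates token.genericType internally
            }

            return token.genericType;
        }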
Example #2
        public override void secondaryFlaging(params object[] resources)
        {
            IContentToken _sub = parent as IContentToken;

            if (_sub != null)
            {
                if (_sub.detectionFlags.HasFlag(tokenDetectionFlag.cityAndPostnumberSubSentences))
                {
                    flags = flags.Add(contentTokenFlag.cityName);
                }
                if (_sub.detectionFlags.HasFlag(tokenDetectionFlag.potentialPersonalNamesSubSentences))
                {
                    flags = flags.Add(contentTokenFlag.personalNameOrLastname);
                }
            }

            if (page != null)
            {
                bool checkHeads = false;
                if (flags.ContainsAll(contentTokenFlag.caseAllUpper, contentTokenFlag.languageWord))
                {
                    checkHeads = true;
                }
                else if (flags.ContainsAll(contentTokenFlag.caseFirstUpper, contentTokenFlag.languageWord, contentTokenFlag.languageUnknownWord))
                {
                    checkHeads = true;
                }
                else if (flags.HasFlag(contentTokenFlag.namedEntity))
                {
                    checkHeads = true;
                }

                if (checkHeads)
                {
                    foreach (contentToken tkn in page.headTokens)
                    {
                        if (content.ToLower() == tkn.content)
                        {
                            if (tkn.origin == contentTokenOrigin.domain)
                            {
                                flags = flags.Add(contentTokenFlag.namedEntity);
                                flags = flags.Add(contentTokenFlag.namedEntityDiscovered);
                            }
                            if (tkn.origin == contentTokenOrigin.title)
                            {
                                if (flags.HasFlag(contentTokenFlag.languageUnknownWord))
                                {
                                    flags = flags.Add(contentTokenFlag.namedEntity);
                                }
                                else
                                {
                                    flags = flags.Add(contentTokenFlag.title);
                                }
                            }
                        }
                    }
                }
            }

            //if (this.items.Query<contentTokenFlags>(enums.contentRelationQueryType.gatherFlags, enums.contentRelationType.manyNext, this, 2).Contains(contentTokenFlag.acronim))
            if (ContainsOneOrMore(contentRelationType.manyNext, 2, contentTokenFlag.acronim, contentTokenFlag.acronimKnown))
            {
                if (flags.HasFlag(contentTokenFlag.namedEntity))
                {
                    flags = flags.Add(contentTokenFlag.namedEntityDiscovered);
                }
            }

            //}

            baseType = nlpTokenBaseType.unknown;
            var fl = flags.getEnumListFromFlags();

            if (fl.ContainsOneOrMore(contentTokenFlag.languageWord, contentTokenFlag.title,
                                     contentTokenFlag.namedEntity, contentTokenFlag.languageKnownWord))
            {
                baseType = nlpTokenBaseType.word;
            }

            if (fl.ContainsOneOrMore(contentTokenFlag.number, contentTokenFlag.numberFormatted,
                                     contentTokenFlag.yearNumber, contentTokenFlag.zipCodeNumber, contentTokenFlag.internationalStandard))
            {
                baseType = nlpTokenBaseType.number;
            }

            if (fl.ContainsOneOrMore(contentTokenFlag.numberFormatted))
            {
                baseType = nlpTokenBaseType.mixed;
            }

            items.ForEach(x => x.secondaryFlaging(resources));
            // throw new NotImplementedException();
        }
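
        // A minimal, hypothetical driver (not in the original sources) showing the expected call
        // pattern: secondaryFlaging is invoked on each top-level token after tokenization and
        // primary flagging; the override above then recurses into child items by itself.
        public static void applySecondaryFlaging(IEnumerable<IContentToken> topLevelTokens, params object[] resources)
        {
            foreach (IContentToken tkn in topLevelTokens)
            {
                tkn.secondaryFlaging(resources); // assumes secondaryFlaging is declared on IContentToken
            }
        }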
Example #3
        /// <summary>
        /// Main method for processing the content of a single sentence >> first calls setSubSentences, then setTokensForSentence
        /// </summary>
        /// <typeparam name="T"></typeparam>
        /// <typeparam name="TS"></typeparam>
        /// <param name="resources"> tokenDetectionFlags flags, contentTokenCollection contentTokenCollections</param>
        /// <returns></returns>
        public virtual contentTokenCollection setTokensFromContent <T, TS>(params object[] resources)
            where T : class, IContentToken, new()
            where TS : IContentSubSentence, new()
        {
            //logSystem.log("set tokens from content Sentence: " + sentence.content, logType.Notification);
            IContentSentence sentence = this;

            tokenDetectionFlag detection_flags = resources.getFirstOfType <tokenDetectionFlag>(); // new tokenDetectionFlags();

            contentTokenCollection tokenCollections = resources.getFirstOfType <contentTokenCollection>();

            if (tokenCollections == null)
            {
                tokenCollections = new contentTokenCollection();
            }

            contentMatchCollection subsentenceMatches = _setSubSentences <TS>(detection_flags, null);

            try
            {
                int subCount = 0;
                for (int dti = 0; dti < subsentenceMatches.Count; dti++)
                {
                    contentMatch dt = subsentenceMatches[subsentenceMatches.Keys.imbGetItemAt(dti).ToString()]; // subsentenceMatches[dti];

                    contentSubSentence ss = dt.element as contentSubSentence;

                    contentTokenCollection subtkns = new contentTokenCollection();
                    //var cs = ss._setTokensForSentence<T>(subtkns, subsentenceMatches, flags);
                    var cs = ss._setTokensForSentence <T>(subtkns, detection_flags);
                    //var cs = ss._setTokensForSentence<T>(tokenCollections, flags);
                    //var cs = tokenCollectionsss._set
                    //var cs = ss._setTokensForSentence<T>(flags);
                    for (int ci = 0; ci < cs.Count; ci++)
                    {
                        ss.setItem(cs[ci]);
                    }

                    //cs = ss._setTokensForSentence<T>(subtkns, subsentenceMatches);
                    // ss.items.AddRange(cs);

                    //  contentTokenCollection subtkns = ss.setTokensFromContent<T>(resources);

                    //ss.items.Add(ss);
                    //foreach (T sst in ss.items)
                    //{

                    //    tokenCollections.Add(sst);
                    //}
                    //tokenCollections.Add(ss);
                    //dt.element = ss;

                    //  subCount++;
                }

                List <IContentToken> directTokens = new List <IContentToken>();

                directTokens = _setTokensForSentence <T>(subsentenceMatches, detection_flags, tokenCollections, directTokens);

                if (directTokens != tokenCollections)
                {
                    for (int dti = 0; dti < directTokens.Count; dti++)
                    {
                        IContentToken dt = directTokens[dti];

                        T tkn = dt as T;
                        if (tkn != null)
                        {
                            tokenCollections.Add(tkn);
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                var isb = new StringBuilder();
                isb.AppendLine("tokenDetection error");
                isb.AppendLine("Target is: " + sentence.toStringSafe());
                throw;
                // devNoteManager.note(sentence, ex, isb.ToString(), "tokenDetection", devNoteType.tokenization);
            }

            foreach (var tk in tokenCollections)
            {
                //subsentenceMatches.allocated(tk.)
                setItem(tk);
            }

            // logSystem.log("set tokens from content Sentence done", logType.Notification);
            return(tokenCollections);
        }
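
        // A minimal usage sketch (an assumption, not part of the original sources): it presumes
        // concrete classes contentSentence, contentToken and contentSubSentence exist and that the
        // sentence content is already set; the helper name and locals below are illustrative only.
        public static contentTokenCollection tokenizeSentence(contentSentence sentence)
        {
            // Resources are passed as params object[]: the method picks out the first
            // tokenDetectionFlag and, optionally, an existing contentTokenCollection to fill.
            tokenDetectionFlag detectionFlags = tokenDetectionFlag.cityAndPostnumberSubSentences
                                              | tokenDetectionFlag.potentialPersonalNamesSubSentences;

            return sentence.setTokensFromContent<contentToken, contentSubSentence>(detectionFlags);
        }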
Example #4
        /// <summary>
        /// PHASE 1: Basic level of generic type detection - uses a series of REGEX tests to determine what kind of token this is, i.e. whether it is a word or not. If it is a word, cleans the content so that only the word remains.
        /// </summary>
        /// <param name="content"></param>
        /// <returns></returns>
        private static nlpTokenGenericType findGenericTypeBasic(IContentToken source)
        {
            Regex reg = null;
            nlpTokenGenericType output = nlpTokenGenericType.unknown;

            if (string.IsNullOrEmpty(source.content))
            {
                output = nlpTokenGenericType.empty;
                return(output);
            }

            if (string.IsNullOrWhiteSpace(source.content))
            {
                output = nlpTokenGenericType.empty;
                return(output);
            }

            if (tokenization.numericSelect.IsMatch(source.content))
            {
                // contains digits
                if (tokenization.numberOrdinal.IsMatch(source.sourceContent))
                {
                    output = nlpTokenGenericType.numberOrdinal;
                }
                else
                {
                    if (tokenization.numbersFormatedExpr.IsMatch(source.sourceContent))
                    {
                        output = nlpTokenGenericType.numberFormated;
                    }
                    else
                    {
                        if (tokenization.lettersSelect.IsMatch(source.content))
                        {
                            output = nlpTokenGenericType.mixedAlfanumeric;
                        }
                        else
                        {
                            output = nlpTokenGenericType.number;
                        }
                    }
                }
            }
            else
            {
                if (tokenization.lettersSelect.IsMatch(source.content))
                {
                    // contains letters
                    Match flw = tokenization.firstLetterWord.Match(source.content);

                    if (flw.Success)
                    {
                        output = nlpTokenGenericType.unknownWord;

                        if (source.content.Contains('@'))
                        {
                            if (tokenization.emailExpr.IsMatch(source.content))
                            {
                                output = nlpTokenGenericType.email;
                            }
                        }
                    }
                    else
                    {
                        if (tokenization.selectPunctation.IsMatch(source.content))
                        {
                            output = nlpTokenGenericType.mixedAlfasymbolic;
                            // no digits
                        }
                        else
                        {
                            output = nlpTokenGenericType.unknownWord;
                        }
                    }
                }
                else
                {
                    if (tokenization.selectPunctation.IsMatch(source.content))
                    {
                        output = nlpTokenGenericType.symbols;
                        // no digits
                    }
                    else
                    {
                        output = nlpTokenGenericType.unknown;
                    }
                }
            }

            if (genericToBaseType(output) == nlpTokenBaseType.word)
            {
                string clean = tokenization.samoRec.Match(source.content).Value;
                source.content = clean;
                source.spliter = source.sourceContent.Replace(clean, "");
            }

            return(output);
        }
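
        // Illustrative expectations for the PHASE 1 classifier above. The concrete regex patterns
        // in the tokenization class are not part of this excerpt, so the mapping below is an
        // assumption about typical inputs, not verified behaviour:
        //
        //   "1984"             -> nlpTokenGenericType.number
        //   "3."               -> nlpTokenGenericType.numberOrdinal   (if numberOrdinal matches dotted ordinals)
        //   "1.250,00"         -> nlpTokenGenericType.numberFormated  (if numbersFormatedExpr matches such formats)
        //   "ISO9001"          -> nlpTokenGenericType.mixedAlfanumeric
        //   "name@example.com" -> nlpTokenGenericType.email
        //   "?!"               -> nlpTokenGenericType.symbols
        //   "word,"            -> nlpTokenGenericType.unknownWord, with content cleaned to "word"
        //                         and spliter set to "," by the samoRec cleanup step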