Example #1
0
        /// <summary>
        /// Tokenizes <paramref name="content"/> into word/punctuation tokens, attaches each token
        /// to <paramref name="sentence"/> (when one is supplied) and returns the list of all tokens created.
        /// </summary>
        /// <typeparam name="T">Concrete token type to instantiate for each match</typeparam>
        /// <param name="content">Raw text to tokenize</param>
        /// <param name="sentence">Sentence that receives the tokens via <c>setItem</c>; may be null</param>
        /// <returns>All tokens created from <paramref name="content"/></returns>
        internal List <T> setTokensFromContent <T>(string content, IContentSentence sentence)
            where T : IContentToken, new()
        {
            List <T> output = new List <T>();

            MatchCollection coll = tokenization.tokenSelectForWordsAndPunctation.Matches(content);

            foreach (Match pmt in coll)
            {
                foreach (Capture cp in pmt.Captures)
                {
                    // FIX: originally `break`, which silently discarded all remaining captures of the
                    // match as soon as one empty/whitespace capture appeared; skip only that capture.
                    if (string.IsNullOrWhiteSpace(cp.Value))
                    {
                        continue;
                    }

                    T newToken = new T();
                    newToken.content       = cp.Value;
                    newToken.parent        = sentence;
                    newToken.spliter       = "";
                    newToken.sourceContent = cp.Value;

                    if (sentence != null)
                    {
                        sentence.setItem(newToken);
                    }
                    output.Add(newToken);
                }
            }

            return(output);
        }
        /// <summary>
        /// Creates tokens for the supplied sentence, using the detection regexes selected by <paramref name="flag"/>.
        /// </summary>
        /// <typeparam name="T">Concrete token type to instantiate for each detected match</typeparam>
        /// <param name="sentence">Sentence whose <c>content</c> is scanned for tokens</param>
        /// <param name="setTokenToSentence">When true, tokens are attached to the matching (sub)sentence; otherwise they are only collected</param>
        /// <param name="flag">Flags selecting which token detectors participate</param>
        /// <param name="subsentenceMatches">Optional sub-sentence allocation map used to re-parent tokens that fall inside a sub-sentence</param>
        /// <returns>Tokens attached directly to <paramref name="sentence"/> (or every token, when <paramref name="setTokenToSentence"/> is false)</returns>
        private static List <IContentToken> setTokensForSentence <T>(IContentSentence sentence, bool setTokenToSentence, tokenDetectionFlag flag, contentMatchCollection subsentenceMatches = null) where T : IContentToken, new()
        {
            contentMatchCollection macroTokens = new contentMatchCollection();
            List <IContentToken>   output      = new List <IContentToken>();

            var flags = flag.getEnumListFromFlags();

            macroTokens.scrambled = sentence.content;

            // Detectors are registered in priority waves: earlier additions claim their
            // character ranges before later, more general ones get a chance.
            foreach (tokenDetectionFlag fl in flags)
            {
                switch (fl)
                {
                case tokenDetectionFlag.emailAddress:
                    macroTokens.Add(_select_emailAddress, fl);
                    break;

                case tokenDetectionFlag.phonenumber:
                    macroTokens.Add(_select_phoneNumber, fl);
                    break;

                case tokenDetectionFlag.standard:
                    macroTokens.Add(_select_standards, fl);
                    break;
                }
            }

            foreach (tokenDetectionFlag fl in flags)
            {
                switch (fl)
                {
                case tokenDetectionFlag.yearNumber:
                    macroTokens.Add(_select_yearNumber, fl);
                    break;

                case tokenDetectionFlag.postOfficeNumber:
                    // NOTE(review): reuses the phone-number regex — looks like a copy-paste from the
                    // phonenumber case; confirm whether a dedicated post-office selector exists.
                    macroTokens.Add(_select_phoneNumber, fl);
                    break;
                }
            }

            foreach (tokenDetectionFlag fl in flags)
            {
                switch (fl)
                {
                case tokenDetectionFlag.acronims:
                    macroTokens.Add(_select_acronimIrregular, fl);
                    macroTokens.Add(_select_acronimByLength, fl);
                    break;
                }
            }

            if (flags.Contains(tokenDetectionFlag.standardDetection))
            {
                macroTokens.Add(_select_tokenWithSplitter, tokenDetectionFlag.none);
            }

            Int32 i  = 0;
            Int32 mx = sentence.content.Length;

            // Walk the sentence character by character, emitting a token whenever the
            // current position falls inside an allocated detector match.
            while (i < mx)
            {
                try
                {
                    #region LOOP

                    oneOrMore <contentMatch> cms = macroTokens.allocated(i, 1);

                    if (cms == null)
                    {
                        // nothing allocated from here on — stop scanning
                        i = mx;
                        continue;
                    }

                    if (cms.isNothing)
                    {
                        i++;
                        continue;
                    }
                    else
                    {
                        contentMatch cm = cms.First();

                        if (cm == null)
                        {
                            i++;
                            continue;
                        }

                        // jump past the matched span
                        i = i + cm.match.Length;

                        T      newToken = new T();
                        String mch      = cm.match.Value.Trim("#".ToCharArray());
                        newToken.sourceContent = mch;
                        newToken.content       = mch;

                        // split off a trailing splitter (punctuation) when present
                        Match sp = _select_tokenWithSplitter.Match(mch);
                        if (sp.Success)
                        {
                            newToken.spliter = sp.Value;
                            newToken.content = newToken.content.removeEndsWith(newToken.spliter);
                        }
                        else
                        {
                            newToken.spliter = "";
                            newToken.content = mch;
                        }

                        IContentSentence _sentence = sentence;

                        if (setTokenToSentence)
                        {
                            // when the token lies inside a sub-sentence, re-parent it there
                            if (subsentenceMatches != null)
                            {
                                if (subsentenceMatches.isAllocated(cm.match.Index, cm.match.Length))
                                {
                                    oneOrMore <contentMatch> subcms = subsentenceMatches.allocated(cm.match.Index,
                                                                                                   cm.match.Length);

                                    contentMatch subcm = subcms.First();
                                    if (subcm != null)
                                    {
                                        _sentence = subcm.element as IContentSubSentence;
                                    }
                                }
                            }
                            if (_sentence != null)
                            {
                                _sentence.setItem(newToken);
                                // only tokens attached directly to the target sentence are returned
                                if (_sentence == sentence)
                                {
                                    output.Add(newToken);
                                }
                            }
                            else
                            {
                                logSystem.log("    -- -- -- _sentence is null  ", logType.Notification);
                            }
                        }
                        else
                        {
                            output.Add(newToken);
                        }

                        if (cm.associatedKey == null)
                        {
                            logSystem.log("    -- -- -- cm.associatedKey  is null ", logType.Notification);
                        }
                        else
                        {
                            tokenDetectionFlag fl = tokenDetectionFlag.none;
                            Enum.TryParse(cm.associatedKey.toStringSafe(), true, out fl);
                            newToken.detectionFlags = fl;
                        }

                        cm.element = newToken;
                    }

                    #endregion LOOP
                }
                catch (Exception ex)
                {
                    // FIX: the original catch built a message it never used and did not advance `i`,
                    // so a recurring exception spun this loop forever; log and move forward.
                    logSystem.log("setTokensForSentence loop error at i=" + i + " [mx=" + mx + "] :: " + ex.Message, logType.Notification);
                    i++;
                }
            }
            return(output);
        }