Example #1
        /// <summary>
        /// Can either build the macroTokens itself or receive them ready-made. Applies the subsentence algorithm and performs standard token detection --- MOST IMPORTANTLY, IT PLACES THE TOKENS/SUBSENTENCES into the parent
        /// </summary>
        /// <typeparam name="T"></typeparam>
        /// <param name="resources"></param>
        /// <returns></returns>
        public virtual contentTokenCollection _setTokensForSentence <T>(params object[] resources)
            where T : IContentToken, new()
        {
            contentMatchCollection subsentenceMatches = resources.getFirstOfType <contentMatchCollection>();

            if (subsentenceMatches == null)
            {
                subsentenceMatches = new contentMatchCollection();
            }


            contentMatchCollection macroTokens = _setMacroTokensForSentence <T>(resources);

            contentTokenCollection output = resources.getFirstOfType <contentTokenCollection>();

            if (output == null)
            {
                output = new contentTokenCollection();
            }

            tokenDetectionFlag flags = resources.getFirstOfType <tokenDetectionFlag>();

            if (flags.HasFlag(tokenDetectionFlag.standardDetection))
            {
                macroTokens.Add(_select_tokenWithSplitter, tokenDetectionFlag.standardDetection);
            }


            string source = "";

            source = content;

            int      i            = 0;
            int      mx           = source.Length;
            int      sI           = 0;
            int      sLimit       = mx;
            DateTime processStart = DateTime.Now;

            while (i < mx)
            {
                try
                {
                    if (sI > sLimit)
                    {
                        aceLog.log("Content sentence tokenization broken");
                        break;
                    }
                    sI++;
                    #region LOOP

                    oneOrMore <contentMatch> cms = macroTokens.allocated(i, 1);

                    if (cms == null)
                    {
                        i = mx;
                        continue;
                    }

                    if (cms.isNothing)
                    {
                        i++;
                        continue;
                    }
                    else
                    {
                        contentMatch cm = cms.First();
                        if (cm == null)
                        {
                            i++;
                            continue;
                        }

                        i = i + cm.match.Length;

                        IContentToken newToken = new T();
                        string        mch      = cm.match.Value.Trim("#".ToCharArray());
                        newToken.sourceContent = mch;
                        newToken.content       = mch;


                        Match sp = _select_tokenSplitter.Match(mch);
                        if (sp.Success)
                        {
                            newToken.spliter = sp.Value;
                            newToken.content = newToken.content.removeEndsWith(newToken.spliter).Trim();
                        }
                        else
                        {
                            //if (cm.match.Groups.Count > 1)
                            //{
                            //    mch = cm.match.Groups[1].Value;
                            //}
                            //else
                            //{

                            //}
                            newToken.spliter = "";
                            newToken.content = mch.Trim();
                        }


                        if (DateTime.Now.Subtract(processStart).TotalMinutes > 2)
                        {
                            aceLog.log("TOKENIZATION TIME LIMIT BROKEN !!!");
                            break;
                        }


                        IContentSentence _sentence = this;

                        if (cm.element is IContentSubSentence)
                        {
                            IContentSubSentence sub = cm.element as IContentSubSentence;
                            sub.masterToken = newToken;
                            newToken        = (IContentToken)cm.element;
                        }

                        /*
                         * if (subsentenceMatches.isAllocated(cm.match.Index, cm.match.Length))
                         * {
                         *  oneOrMore<contentMatch> subcms = subsentenceMatches.allocated(cm.match.Index, cm.match.Length);
                         *  contentMatch subcm = subcms.First();
                         *  if (subcm == null)
                         *  {
                         *      // logSystem.log("    -- -- -- sub cm is null  ", logType.Notification);
                         *  }
                         *  else
                         *  {
                         *      if (subcm.element != null)
                         *      {
                         *
                         *      }
                         *      _sentence = subcm.element as IContentSubSentence;
                         *
                         *      if (_sentence != null)
                         *      {
                         *          IContentSubSentence _subSentence = _sentence as IContentSubSentence;
                         *          newToken.flags.Add(contentTokenFlag.subsentence_inner);
                         *          _subSentence.masterToken = newToken;
                         *          newToken = (T)(_subSentence as IContentToken);
                         *
                         *
                         *
                         *
                         *          //_sentence.setItem(newToken);
                         *
                         *          //if (output.Contains(_sentence as IContentToken))
                         *          //{
                         *
                         *          //}
                         *          //else
                         *          //{
                         *
                         *          //     output.Add(_sentence as IContentToken);
                         *          //}
                         *
                         *      }
                         *      else
                         *      {
                         *         // output.Add(newToken);
                         *      }
                         *  }
                         *
                         *
                         * }*/



                        /*
                         * if (_sentence != null)
                         * {
                         *
                         *
                         *  //setItem(_sentence);
                         *
                         *
                         *  if (_sentence == this)
                         *  {
                         *
                         *      output.Add(newToken);
                         *  } else
                         *  {
                         *      setItem(newToken);
                         *      if (output.Contains(_sentence as IContentToken))
                         *      {
                         *
                         *      }
                         *      else
                         *      {
                         *         // output.Add(_sentence as IContentToken);
                         *      }
                         *
                         *  }
                         *
                         * }
                         * else
                         * {
                         *  setItem(newToken);
                         * }
                         */
                        if (cm.associatedKey != null)
                        {
                            tokenDetectionFlag fl = tokenDetectionFlag.none;
                            Enum.TryParse(cm.associatedKey.toStringSafe(), true, out fl);
                            newToken.detectionFlags = fl;
                        }
                        if (!output.Contains(newToken) && !ReferenceEquals(newToken, this))
                        {
                            output.Add(newToken);
                        }
                    }

                    #endregion
                }
                catch (Exception ex)
                {
                    var isb = new StringBuilder();
                    isb.AppendLine("loop error: " + ex.Message);
                    isb.AppendLine("Target is: i=" + i + "[mx=" + mx + "]");
                    throw new aceGeneralException(isb.ToString(), null, this, "Loop");
                    // devNoteManager.note(ex, isb.ToString(), "loop error", devNoteType.tokenization);
                }
            }


            return(output);
        }
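        /// <summary>
        /// Usage sketch (not part of the original source): a hypothetical call to _setTokensForSentence,
        /// assuming a contentToken class that implements IContentToken and has a parameterless constructor.
        /// Resources are matched by type via getFirstOfType, so the argument order is not significant.
        /// </summary>
        public contentTokenCollection exampleSetTokens_sketch()
        {
            contentTokenCollection existing = new contentTokenCollection();
            tokenDetectionFlag     flags    = tokenDetectionFlag.standardDetection;

            // tokens detected in this sentence's content are added to `existing` and returned
            return _setTokensForSentence <contentToken>(existing, flags);
        }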
        /// <summary>
        /// Creates tokens for the passed sentence
        /// </summary>
        /// <typeparam name="T"></typeparam>
        /// <param name="sentence"></param>
        /// <param name="setTokenToSentence"></param>
        /// <param name="flags"></param>
        /// <param name="subsentenceMatches"></param>
        /// <returns></returns>
        private static List <IContentToken> setTokensForSentence <T>(IContentSentence sentence, bool setTokenToSentence, tokenDetectionFlag flag, contentMatchCollection subsentenceMatches = null) where T : IContentToken, new()
        {
            contentMatchCollection macroTokens = new contentMatchCollection();
            List <IContentToken>   output      = new List <IContentToken>();

            String scrambled = sentence.content;
            var    flags     = flag.getEnumListFromFlags();

            macroTokens.scrambled = sentence.content;

            foreach (tokenDetectionFlag fl in flags)
            {
                switch (fl)
                {
                case tokenDetectionFlag.emailAddress:
                    macroTokens.Add(_select_emailAddress, fl);
                    break;

                case tokenDetectionFlag.phonenumber:
                    macroTokens.Add(_select_phoneNumber, fl);
                    break;

                case tokenDetectionFlag.standard:
                    macroTokens.Add(_select_standards, fl);
                    break;
                }
            }

            foreach (tokenDetectionFlag fl in flags)
            {
                switch (fl)
                {
                case tokenDetectionFlag.yearNumber:
                    macroTokens.Add(_select_yearNumber, fl);
                    break;

                case tokenDetectionFlag.postOfficeNumber:
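                    // note: the phone-number pattern is reused here for post office numbers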
                    macroTokens.Add(_select_phoneNumber, fl);
                    break;
                }
            }

            foreach (tokenDetectionFlag fl in flags)
            {
                switch (fl)
                {
                case tokenDetectionFlag.acronims:
                    macroTokens.Add(_select_acronimIrregular, fl);
                    macroTokens.Add(_select_acronimByLength, fl);
                    break;
                }
            }

            // logSystem.log("    -- setTokensForSentence: quering performed "+ macroTokens.Count   , logType.Notification);

            if (flags.Contains(tokenDetectionFlag.standardDetection))
            {
                macroTokens.Add(_select_tokenWithSplitter, tokenDetectionFlag.none);
            }

            Int32 i  = 0;
            Int32 mx = sentence.content.Length;

            while (i < mx)
            {
                try
                {
                    #region LOOP

                    oneOrMore <contentMatch> cms = macroTokens.allocated(i, 1);

                    if (cms == null)
                    {
                        i = mx;
                        continue;
                    }

                    if (cms.isNothing)
                    {
                        i++;
                        continue;
                    }
                    else
                    {
                        contentMatch cm = cms.First();

                        //
                        if (cm == null)
                        {
                            // logSystem.log("        -- -- -- cm is null " + cm.toStringSafe(), logType.Notification);
                            i++;
                            continue;
                        }
                        else
                        {
                            //        logSystem.log("        -- -- -- cm found " + cm.toStringSafe(), logType.Notification);
                        }

                        i = i + cm.match.Length;

                        T      newToken = new T();
                        String mch      = cm.match.Value.Trim("#".ToCharArray());
                        newToken.sourceContent = mch;
                        newToken.content       = mch;

                        Match sp = _select_tokenWithSplitter.Match(mch);
                        if (sp.Success)
                        {
                            newToken.spliter = sp.Value;
                            newToken.content = newToken.content.removeEndsWith(newToken.spliter);
                        }
                        else
                        {
                            newToken.spliter = "";
                            newToken.content = mch;
                        }

                        IContentSentence _sentence = sentence;

                        if (setTokenToSentence)
                        {
                            if (subsentenceMatches != null)
                            {
                                if (subsentenceMatches.isAllocated(cm.match.Index, cm.match.Length))
                                {
                                    oneOrMore <contentMatch> subcms = subsentenceMatches.allocated(cm.match.Index,
                                                                                                   cm.match.Length);

                                    contentMatch subcm = subcms.First();
                                    if (subcm == null)
                                    {
                                        // logSystem.log("    -- -- -- sub cm is null  ", logType.Notification);
                                    }
                                    else
                                    {
                                        _sentence = subcm.element as IContentSubSentence;
                                    }
                                }
                                else
                                {
                                }
                            }
                            else
                            {
                            }
                            if (_sentence != null)
                            {
                                _sentence.setItem(newToken);
                                if (_sentence == sentence)
                                {
                                    output.Add(newToken);
                                }
                            }
                            else
                            {
                                logSystem.log("    -- -- -- _sentence is null  ", logType.Notification);
                            }
                        }
                        else
                        {
                            output.Add(newToken);
                        }

                        if (cm.associatedKey == null)
                        {
                            logSystem.log("    -- -- -- cm.associatedKey  is null ", logType.Notification);
                        }
                        else
                        {
                            tokenDetectionFlag fl = tokenDetectionFlag.none;

                            Enum.TryParse(cm.associatedKey.toStringSafe(), true, out fl);

                            newToken.detectionFlags = fl;
                        }

                        cm.element = newToken;
                    }

                    #endregion LOOP
                }
                catch (Exception ex)
                {
                    var isb = new StringBuilder();
                    isb.AppendLine("loop error: " + ex.Message);
                    isb.AppendLine("Target is: i=" + i + "[mx=" + mx + "]");
                    logSystem.log(isb.ToString(), logType.Notification);
                    //devNoteManager.note(ex, isb.ToString(), "loop error", devNoteType.tokenization);
                }
            }
            return(output);
        }
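        /// <summary>
        /// Usage sketch (not part of the original source): drives the private helper above directly,
        /// assuming a contentToken class that implements IContentToken; subsentenceMatches may be omitted.
        /// </summary>
        private static List <IContentToken> exampleSetTokensForSentence_sketch(IContentSentence sentence)
        {
            // detect plain tokens plus e-mail addresses, and attach each token to its (sub)sentence
            return setTokensForSentence <contentToken>(sentence, true,
                                                       tokenDetectionFlag.standard | tokenDetectionFlag.emailAddress);
        }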
Example #3
        /// <summary>
        /// Main method for processing the content of a single sentence >> first calls setSubSentences, then setTokensForSentence
        /// </summary>
        /// <typeparam name="T"></typeparam>
        /// <typeparam name="TS"></typeparam>
        /// <param name="resources"> tokenDetectionFlags flags, contentTokenCollection contentTokenCollections</param>
        /// <returns></returns>
        public virtual contentTokenCollection setTokensFromContent <T, TS>(params object[] resources)
            where T : class, IContentToken, new()
            where TS : IContentSubSentence, new()
        {
            //logSystem.log("set tokens from content Sentence: " + sentence.content, logType.Notification);
            IContentSentence sentence = this;

            tokenDetectionFlag detection_flags = resources.getFirstOfType <tokenDetectionFlag>(); // new tokenDetectionFlags();

            contentTokenCollection tokenCollections = resources.getFirstOfType <contentTokenCollection>();

            if (tokenCollections == null)
            {
                tokenCollections = new contentTokenCollection();
            }

            contentMatchCollection subsentenceMatches = _setSubSentences <TS>(detection_flags, null);

            try
            {
                int subCount = 0;
                for (int dti = 0; dti < subsentenceMatches.Count; dti++)
                {
                    contentMatch dt = subsentenceMatches[subsentenceMatches.Keys.imbGetItemAt(dti).ToString()]; // subsentenceMatches[dti];

                    contentSubSentence ss = dt.element as contentSubSentence;

                    contentTokenCollection subtkns = new contentTokenCollection();
                    //var cs = ss._setTokensForSentence<T>(subtkns, subsentenceMatches, flags);
                    var cs = ss._setTokensForSentence <T>(subtkns, detection_flags);
                    //var cs = ss._setTokensForSentence<T>(tokenCollections, flags);
                    //var cs = tokenCollectionsss._set
                    //var cs = ss._setTokensForSentence<T>(flags);
                    for (int ci = 0; ci < cs.Count; ci++)
                    {
                        ss.setItem(cs[ci]);
                    }

                    //cs = ss._setTokensForSentence<T>(subtkns, subsentenceMatches);
                    // ss.items.AddRange(cs);

                    //  contentTokenCollection subtkns = ss.setTokensFromContent<T>(resources);

                    //ss.items.Add(ss);
                    //foreach (T sst in ss.items)
                    //{

                    //    tokenCollections.Add(sst);
                    //}
                    //tokenCollections.Add(ss);
                    //dt.element = ss;

                    //  subCount++;
                }

                List <IContentToken> directTokens = new List <IContentToken>();

                directTokens = _setTokensForSentence <T>(subsentenceMatches, detection_flags, tokenCollections, directTokens);

                if (directTokens != tokenCollections)
                {
                    for (int dti = 0; dti < directTokens.Count; dti++)
                    {
                        IContentToken dt = directTokens[dti];

                        T tkn = dt as T;
                        if (tkn != null)
                        {
                            tokenCollections.Add(tkn);
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                var isb = new StringBuilder();
                isb.AppendLine("tokenDetection error: " + ex.Message);
                isb.AppendLine("Target is: " + sentence.toStringSafe());
                aceLog.log(isb.ToString());
                throw;
                // devNoteManager.note(sentence, ex, isb.ToString(), "tokenDetection", devNoteType.tokenization);
            }

            foreach (var tk in tokenCollections)
            {
                //subsentenceMatches.allocated(tk.)
                setItem(tk);
            }

            // logSystem.log("set tokens from content Sentence done", logType.Notification);
            return(tokenCollections);
        }
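        /// <summary>
        /// Usage sketch (not part of the original source): shows how the main entry point above might be
        /// called. contentSubSentence appears in this source; contentToken is an assumed token class
        /// implementing IContentToken with a parameterless constructor.
        /// </summary>
        public contentTokenCollection exampleTokenizeContent_sketch()
        {
            // combine the detection flags to run; both members appear elsewhere in this source
            tokenDetectionFlag flags = tokenDetectionFlag.standardDetection | tokenDetectionFlag.emailAddress;

            // builds sub-sentences first, then detects tokens and attaches them to this sentence
            return setTokensFromContent <contentToken, contentSubSentence>(flags);
        }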