Example #1
0
        /// <summary>
        /// Returns every distinct <c>contentMatch</c> registered in <c>allocation</c> for the positions in
        /// the range starting at <paramref name="index"/> and spanning <paramref name="length"/> positions.
        /// </summary>
        /// <remarks>
        /// When a match is found the cursor jumps ahead by the length of that match (or by a single
        /// position for a zero-length match). Positions that have no entry, or whose entry is null, are
        /// stepped over one at a time.
        /// </remarks>
        /// <param name="index">First position to inspect.</param>
        /// <param name="length">Number of positions to inspect, counted from <paramref name="index"/>.</param>
        /// <returns>
        /// The matches found; an empty collection when <c>Count</c> is zero or nothing is allocated in the range.
        /// </returns>
        /// <exception cref="aceGeneralException">
        /// Thrown when the scan needs more than 10000 iterations (safety limit).
        /// </exception>
        public oneOrMore <contentMatch> allocated(int index, int length)
        {
            oneOrMore<contentMatch> output = new oneOrMore<contentMatch>();

            if (Count == 0)
            {
                return(output);
            }

            int end = index + length;
            int ind = index;
            int sc  = 0;     // iterations performed so far
            int scl = 10000; // safety limit on the number of iterations

            while (ind < end)
            {
                // Default step: one position. This also covers a null entry, which previously left
                // the cursor in place and burned the whole safety budget before throwing.
                int step = 1;

                if (allocation.ContainsKey(ind))
                {
                    contentMatch cm = allocation[ind];
                    if (cm != null)
                    {
                        if (!output.Contains(cm))
                        {
                            output.Add(cm);
                        }

                        // Skip the rest of the matched span; zero-length matches still advance by one.
                        if (cm.match.Length > 0)
                        {
                            step = cm.match.Length;
                        }
                    }
                }

                ind += step;

                sc++;
                if (sc > scl)
                {
                    throw new aceGeneralException("allocated(" + index + "," + length + ") failed: safety count is triggered [" +
                                                  sc + "]");
                }
            }

            return(output);
        }
        /// <summary>
        /// Builds tokens for the supplied sentence.
        /// </summary>
        /// <remarks>
        /// Selectors are registered on a temporary <c>contentMatchCollection</c> in three passes over the
        /// requested flags (e-mail / phone / standard first, then year and post-office numbers, then
        /// acronyms), followed by the token-with-splitter selector when <c>standardDetection</c> is
        /// requested. The sentence content is then walked left to right and one token is created for each
        /// allocated match. Errors inside the walk are logged and the walk continues with the next position.
        /// </remarks>
        /// <typeparam name="T">Concrete token type instantiated for each match.</typeparam>
        /// <param name="sentence">Sentence whose <c>content</c> is tokenized.</param>
        /// <param name="setTokenToSentence">
        /// When true, each token is attached with <c>setItem</c> to the sentence (or to the sub-sentence
        /// found for the match) and only tokens attached directly to <paramref name="sentence"/> are
        /// returned. When false, nothing is attached and every created token is returned.
        /// </param>
        /// <param name="flag">Combination of detection flags selecting which selectors are applied.</param>
        /// <param name="subsentenceMatches">Optional sub-sentence matches used to route tokens into sub-sentences.</param>
        /// <returns>The created tokens, filtered as described for <paramref name="setTokenToSentence"/>.</returns>
        private static List <IContentToken> setTokensForSentence <T>(IContentSentence sentence, bool setTokenToSentence, tokenDetectionFlag flag, contentMatchCollection subsentenceMatches = null) where T : IContentToken, new()
        {
            contentMatchCollection macroTokens = new contentMatchCollection();
            List<IContentToken>    output      = new List<IContentToken>();

            var flags = flag.getEnumListFromFlags();

            macroTokens.scrambled = sentence.content;

            // Pass 1: e-mail addresses, phone numbers and standards are registered first.
            foreach (tokenDetectionFlag fl in flags)
            {
                switch (fl)
                {
                    case tokenDetectionFlag.emailAddress:
                        macroTokens.Add(_select_emailAddress, fl);
                        break;

                    case tokenDetectionFlag.phonenumber:
                        macroTokens.Add(_select_phoneNumber, fl);
                        break;

                    case tokenDetectionFlag.standard:
                        macroTokens.Add(_select_standards, fl);
                        break;
                }
            }

            // Pass 2: numeric patterns.
            foreach (tokenDetectionFlag fl in flags)
            {
                switch (fl)
                {
                    case tokenDetectionFlag.yearNumber:
                        macroTokens.Add(_select_yearNumber, fl);
                        break;

                    case tokenDetectionFlag.postOfficeNumber:
                        // NOTE(review): this reuses the phone-number selector for post-office numbers,
                        // which looks like a copy-paste slip — confirm whether a dedicated selector exists.
                        macroTokens.Add(_select_phoneNumber, fl);
                        break;
                }
            }

            // Pass 3: acronyms.
            foreach (tokenDetectionFlag fl in flags)
            {
                switch (fl)
                {
                    case tokenDetectionFlag.acronims:
                        macroTokens.Add(_select_acronimIrregular, fl);
                        macroTokens.Add(_select_acronimByLength, fl);
                        break;
                }
            }

            if (flags.Contains(tokenDetectionFlag.standardDetection))
            {
                macroTokens.Add(_select_tokenWithSplitter, tokenDetectionFlag.none);
            }

            Int32 i  = 0;
            Int32 mx = sentence.content.Length;

            while (i < mx)
            {
                // Remembered so the error handler can tell whether this iteration made progress.
                Int32 iterationStart = i;

                try
                {
                    oneOrMore<contentMatch> cms = macroTokens.allocated(i, 1);

                    if (cms == null)
                    {
                        i = mx;
                        continue;
                    }

                    if (cms.isNothing)
                    {
                        i++;
                        continue;
                    }

                    contentMatch cm = cms.First();
                    if (cm == null)
                    {
                        i++;
                        continue;
                    }

                    // Always move forward: a zero-length match would otherwise repeat forever.
                    i += cm.match.Length > 0 ? cm.match.Length : 1;

                    T      newToken = new T();
                    String mch      = cm.match.Value.Trim('#');
                    newToken.sourceContent = mch;
                    newToken.content       = mch;

                    Match sp = _select_tokenWithSplitter.Match(mch);
                    if (sp.Success)
                    {
                        newToken.spliter = sp.Value;
                        newToken.content = newToken.content.removeEndsWith(newToken.spliter);
                    }
                    else
                    {
                        newToken.spliter = "";
                        newToken.content = mch;
                    }

                    if (setTokenToSentence)
                    {
                        IContentSentence target = sentence;

                        if (subsentenceMatches != null &&
                            subsentenceMatches.isAllocated(cm.match.Index, cm.match.Length))
                        {
                            oneOrMore<contentMatch> subcms = subsentenceMatches.allocated(cm.match.Index,
                                                                                          cm.match.Length);
                            contentMatch subcm = subcms.First();
                            if (subcm != null)
                            {
                                // Becomes null when the matched element is not a sub-sentence.
                                target = subcm.element as IContentSubSentence;
                            }
                        }

                        if (target != null)
                        {
                            target.setItem(newToken);

                            // Tokens routed into a sub-sentence are not part of the returned list.
                            if (target == sentence)
                            {
                                output.Add(newToken);
                            }
                        }
                        else
                        {
                            logSystem.log("    -- -- -- _sentence is null  ", logType.Notification);
                        }
                    }
                    else
                    {
                        output.Add(newToken);
                    }

                    if (cm.associatedKey == null)
                    {
                        logSystem.log("    -- -- -- cm.associatedKey  is null ", logType.Notification);
                    }
                    else
                    {
                        // An unparsable key leaves the flag at its default value.
                        tokenDetectionFlag fl = tokenDetectionFlag.none;
                        Enum.TryParse(cm.associatedKey.toStringSafe(), true, out fl);
                        newToken.detectionFlags = fl;
                    }

                    cm.element = newToken;
                }
                catch (Exception ex)
                {
                    // Best-effort tokenization: report the failure and carry on with the next position.
                    var isb = new StringBuilder();
                    isb.AppendLine("loop error error");
                    isb.AppendLine("Target is: i=" + i + "[mx=" + mx + "]");
                    isb.AppendLine(ex.GetType().Name + ": " + ex.Message);
                    logSystem.log(isb.ToString(), logType.Notification);

                    // If the failure happened before the cursor moved, step over this position;
                    // otherwise the same failing iteration would repeat endlessly.
                    if (i == iterationStart)
                    {
                        i++;
                    }
                }
            }

            return(output);
        }
Example #3
0
        /// <summary>
        /// Tokenizes this sentence's <c>content</c>: macro tokens are obtained from
        /// <c>_setMacroTokensForSentence</c>, the standard token-with-splitter selector is added when
        /// requested, and the resulting tokens / sub-sentences are collected into the output collection.
        /// </summary>
        /// <remarks>
        /// A match whose element is already an <c>IContentSubSentence</c> is emitted as that sub-sentence,
        /// with the freshly created token assigned as its <c>masterToken</c>. The walk stops early, after
        /// logging, when the iteration guard or the two-minute time limit is exceeded.
        /// </remarks>
        /// <typeparam name="T">Concrete token type instantiated for each match.</typeparam>
        /// <param name="resources">
        /// Loosely typed arguments. The first <c>contentMatchCollection</c>, <c>contentTokenCollection</c>
        /// and <c>tokenDetectionFlag</c> found are used, and the whole array is forwarded to
        /// <c>_setMacroTokensForSentence</c>.
        /// </param>
        /// <returns>The <c>contentTokenCollection</c> taken from <paramref name="resources"/>, with the detected tokens added.</returns>
        /// <exception cref="aceGeneralException">Thrown when any error occurs while processing a match.</exception>
        public virtual contentTokenCollection _setTokensForSentence <T>(params object[] resources)
            where T : IContentToken, new()
        {
            // Retained from the original flow; the active code path below does not consume it
            // (routing through sub-sentence matches is currently disabled).
            contentMatchCollection subsentenceMatches = resources.getFirstOfType<contentMatchCollection>();
            if (subsentenceMatches == null)
            {
                subsentenceMatches = new contentMatchCollection();
            }

            contentMatchCollection macroTokens = _setMacroTokensForSentence<T>(resources);

            // NOTE(review): the code below assumes a contentTokenCollection was supplied in resources — confirm
            // what getFirstOfType returns when none is present.
            contentTokenCollection output = resources.getFirstOfType<contentTokenCollection>();

            tokenDetectionFlag flags = resources.getFirstOfType<tokenDetectionFlag>();

            if (flags.HasFlag(tokenDetectionFlag.standardDetection))
            {
                macroTokens.Add(_select_tokenWithSplitter, tokenDetectionFlag.standardDetection);
            }

            string source = content;

            int i      = 0;
            int mx     = source.Length;
            int sI     = 0;  // iterations performed so far
            int sLimit = mx; // iteration guard: never more iterations than characters

            // UTC is used so the elapsed-time check is not affected by local clock (DST) shifts.
            DateTime processStart = DateTime.UtcNow;

            while (i < mx)
            {
                try
                {
                    if (sI > sLimit)
                    {
                        aceLog.log("Content sentence tokenization broken");
                        break;
                    }
                    sI++;

                    oneOrMore<contentMatch> cms = macroTokens.allocated(i, 1);

                    if (cms == null)
                    {
                        i = mx;
                        continue;
                    }

                    if (cms.isNothing)
                    {
                        i++;
                        continue;
                    }

                    contentMatch cm = cms.First();
                    if (cm == null)
                    {
                        i++;
                        continue;
                    }

                    // Always move forward: a zero-length match would otherwise stall the cursor until
                    // the iteration guard aborts the whole tokenization.
                    i += cm.match.Length > 0 ? cm.match.Length : 1;

                    IContentToken newToken = new T();
                    string        mch      = cm.match.Value.Trim('#');
                    newToken.sourceContent = mch;
                    newToken.content       = mch;

                    Match sp = _select_tokenSplitter.Match(mch);
                    if (sp.Success)
                    {
                        newToken.spliter = sp.Value;
                        newToken.content = newToken.content.removeEndsWith(newToken.spliter).Trim();
                    }
                    else
                    {
                        newToken.spliter = "";
                        newToken.content = mch.Trim();
                    }

                    // TotalMinutes is the whole elapsed span; the Minutes component only ranges 0-59
                    // and would both trigger late and wrap around after an hour.
                    if (DateTime.UtcNow.Subtract(processStart).TotalMinutes > 2)
                    {
                        aceLog.log("TOKENIZATION TIME LIMIT BROKEN !!!");
                        break;
                    }

                    // A match already bound to a sub-sentence is emitted as that sub-sentence,
                    // with the new token kept as its master token.
                    IContentSubSentence sub = cm.element as IContentSubSentence;
                    if (sub != null)
                    {
                        sub.masterToken = newToken;
                        newToken        = (IContentToken)cm.element;
                    }

                    if (cm.associatedKey != null)
                    {
                        // An unparsable key leaves the flag at its default value.
                        tokenDetectionFlag fl = tokenDetectionFlag.none;
                        Enum.TryParse(cm.associatedKey.toStringSafe(), true, out fl);
                        newToken.detectionFlags = fl;
                    }

                    // Avoid duplicates and never add this sentence to its own output.
                    if (!output.Contains(newToken) && newToken != this)
                    {
                        output.Add(newToken);
                    }
                }
                catch (Exception ex)
                {
                    var isb = new StringBuilder();
                    isb.AppendLine("loop error error");
                    isb.AppendLine("Target is: i=" + i + "[mx=" + mx + "]");
                    // Keep the root cause visible in the message.
                    isb.AppendLine(ex.GetType().Name + ": " + ex.Message);
                    // NOTE(review): if the second constructor argument is the inner exception, pass `ex`
                    // instead of null so the original stack trace is preserved — confirm the signature.
                    throw new aceGeneralException(isb.ToString(), null, this, "Loop");
                }
            }

            return(output);
        }