public List <CF> Query <CF>(contentRelationQueryType qType, contentRelationType qRelation, IContentElement qReference, int limit = -1)
        {
            List <CF>          output   = new List <CF>();
            contentElementList elements = Query(qRelation, qReference, limit);

            foreach (IContentElement element in elements)
            {
                switch (qType)
                {
                case contentRelationQueryType.gatherFlags:

                    IContentToken ict = element as IContentToken;
                    if (ict != null)
                    {
                        output.AddRange(ict.flags.getEnumListFromFlags <CF>());                 // output.populateWith(ict.flags);
                    }
                    break;

                case contentRelationQueryType.gatherSentenceFlags:
                    IContentSentence ics = element as IContentSentence;
                    if (ics != null)
                    {
                        output.AddRange(ics.sentenceFlags.getEnumListFromFlags <CF>());                 // output.populateWith(ics.sentenceFlags);
                    }
                    break;

                case contentRelationQueryType.gatherParagraphFlags:
                    IContentParagraph icp = element as IContentParagraph;
                    if (icp != null)
                    {
                        output.AddRange(icp.flags.getEnumListFromFlags <CF>());                 // output.populateWith(icp.flags);
                    }
                    break;

                case contentRelationQueryType.gatherBlockTags:
                    IContentBlock icb = element as IContentBlock;
                    if (icb != null)
                    {
                        output.AddRange(icb.flags.getEnumListFromFlags <CF>());                 // output.populateWith(icb.flags);
                    }
                    break;

                case contentRelationQueryType.gatherOrigins:
                    throw new NotImplementedException("gatherOrigin");

                    //IContentToken icto = element as IContentToken;
                    //if (icto != null) output.Add((CF)icto.origin);

                default:
                    //output.populateWith(element.flags);
                    break;
                }
            }
            return(output);
        }
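A hedged usage sketch, not part of the original listing: it assumes relationIndex is an instance of the class that declares Query<CF>, contentTokenFlag is the enum stored in IContentToken.flags, and relation / referenceElement come from the caller; only contentRelationQueryType.gatherFlags and the Query<CF> signature are taken from the code above.

        // hedged sketch - relationIndex, contentTokenFlag, relation and referenceElement are assumed names
        List<contentTokenFlag> gatheredFlags = relationIndex.Query<contentTokenFlag>(
            contentRelationQueryType.gatherFlags,   // collect IContentToken.flags from each related element
            relation,                               // any contentRelationType value
            referenceElement,                       // IContentElement the relation is resolved against
            20);                                    // presumably caps the related elements queried (-1 = no limit)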
Example #2
        /// <summary>
        /// Places tokens into the supplied sentence and returns the list of all tokens
        /// </summary>
        /// <param name="content">Raw text to tokenize</param>
        /// <param name="sentence">Sentence that receives the created tokens</param>
        /// <returns>List of tokens created from the content</returns>
        internal List <T> setTokensFromContent <T>(string content, IContentSentence sentence)
            where T : IContentToken, new()
        {
            List <T> output = new List <T>();


            MatchCollection coll = tokenization.tokenSelectForWordsAndPunctation.Matches(content);

            //String[] rawTokens = wordSpliter.Split(content);

            foreach (Match pmt in coll)
            {
                foreach (Capture cp in pmt.Captures)
                {
                    string tkn = cp.Value;
                    if (string.IsNullOrEmpty(tkn))
                    {
                        break;
                    }
                    tkn = tkn.Trim();
                    if (string.IsNullOrEmpty(tkn))
                    {
                        break;
                    }

                    //cp.Index
                    T newToken = new T(); //(cp.Value, sentence, "", cp.Value);
                    newToken.content       = cp.Value;
                    newToken.parent        = sentence;
                    newToken.spliter       = "";
                    newToken.sourceContent = cp.Value;

                    if (sentence != null)
                    {
                        sentence.setItem(newToken);
                    }
                    output.Add(newToken);
                }
            }

            return(output);
        }
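A hedged usage sketch, not in the original: contentSentence and contentToken are assumed concrete implementations of IContentSentence / IContentToken from the surrounding library, and the call is made from inside the declaring class because the method is internal.

        // hedged sketch - contentSentence and contentToken are assumed implementations
        IContentSentence sentence = new contentSentence();
        sentence.content = "Tokens are split on words and punctuation.";

        List<contentToken> tokens = setTokensFromContent<contentToken>(sentence.content, sentence);
        // every token has content, parent, spliter and sourceContent set,
        // and has been attached to the sentence via sentence.setItem(...)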
Example #3
        public override void primaryFlaging(params object[] resources)
        {
            items.ForEach(x => x.primaryFlaging(resources));

            if (items.Count == 1)
            {
                IContentSentence sen = (IContentSentence)items[0];
                if (sen.sentenceFlags.HasFlag(contentSentenceFlag.title))
                {
                    flags |= contentParagraphFlag.heading;
                    if (parent is IContentBlock)
                    {
                        IContentBlock bl = parent as IContentBlock;
                        bl.title += sen.content;
                    }
                }
            }
            // throw new System.NotImplementedException();
        }
        /// <summary>
        /// Places tokens into the supplied sentence and returns the list of all tokens
        /// </summary>
        /// <typeparam name="T">Token type to instantiate</typeparam>
        /// <typeparam name="TS">Sub-sentence type to instantiate</typeparam>
        /// <param name="resources">Expects an IContentSentence, plus optional contentPreprocessFlag, subsentenceDetectionFlag and tokenDetectionFlag entries</param>
        /// <returns>List of tokens created for the sentence</returns>
        internal static List <T> setTokensFromContent <T, TS>(params object[] resources)
            where T : class, IContentToken, new()
            where TS : IContentSubSentence, new()
        {
            //logSystem.log("set tokens from content Sentence: " + sentence.content, logType.Notification);
            IContentSentence         sentence        = resources.getFirstOfType <IContentSentence>();
            contentPreprocessFlag    preprocessFlags = resources.getFirstOfType <contentPreprocessFlag>();
            subsentenceDetectionFlag subflags        = resources.getFirstOfType <subsentenceDetectionFlag>();
            tokenDetectionFlag       flags           = resources.getFirstOfType <tokenDetectionFlag>();

            //tokenDetectionFlag[] _flags

            List <T> output = new List <T>();

            try
            {
                //subsentenceDetectionFlags subflags = _subflags;
                // tokenDetectionFlags flags = _flags;

                string pcontent = preprocess.process(sentence.content, preprocessFlags);

                contentMatchCollection subsentenceMatches = subsentenceDetection.setSubSentences <TS>(sentence, subflags);

                foreach (contentMatch dt in subsentenceMatches.Values)
                {
                    IContentSubSentence ss = dt.element as IContentSubSentence;
                    sentence.items.Add(ss);
                    foreach (T sst in ss.items)
                    {
                        output.Add(sst);
                    }
                    //output.AddRange(ss.items);
                }

                List <IContentToken> directTokens = new List <IContentToken>();

                directTokens = setTokensForSentence <T>(sentence, true, flags, subsentenceMatches);

                if (directTokens != null)
                {
                    foreach (IContentToken dt in directTokens)
                    {
                        T tkn = dt as T;
                        if (tkn != null)
                        {
                            output.Add(tkn);
                        }
                    }
                }

                sentence.content = pcontent;
            }
            catch (Exception ex)
            {
                var isb = new StringBuilder();
                isb.AppendLine("tokenDetection error");
                isb.AppendLine("Target is: " + sentence.toStringSafe());
                throw;
                // devNoteManager.note(sentence, ex, isb.ToString(), "tokenDetection", devNoteType.tokenization);
            }

            // logSystem.log("set tokens from content Sentence done", logType.Notification);
            return(output);
        }
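A hedged call sketch for the static helper, not in the original: contentToken and contentSubSentence are assumed implementations, and sentence, preprocessFlags, subFlags and tokenFlags are caller-side variables of the types the method extracts with getFirstOfType, so their order in the params array does not matter.

        // hedged sketch - the method pulls each argument out of the params array by type
        List<contentToken> tokens = setTokensFromContent<contentToken, contentSubSentence>(
            sentence,          // IContentSentence to process
            preprocessFlags,   // contentPreprocessFlag
            subFlags,          // subsentenceDetectionFlag
            tokenFlags);       // tokenDetectionFlag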
        /// <summary>
        /// Builds tokens for the supplied sentence
        /// </summary>
        /// <typeparam name="T">Token type to instantiate</typeparam>
        /// <param name="sentence">Sentence whose content is scanned</param>
        /// <param name="setTokenToSentence">If true, tokens are attached to the sentence (or to the matching sub-sentence)</param>
        /// <param name="flag">Token detection flags to apply</param>
        /// <param name="subsentenceMatches">Previously detected sub-sentence matches, if any</param>
        /// <returns>Tokens attached directly to the sentence</returns>
        private static List <IContentToken> setTokensForSentence <T>(IContentSentence sentence, bool setTokenToSentence, tokenDetectionFlag flag, contentMatchCollection subsentenceMatches = null) where T : IContentToken, new()
        {
            contentMatchCollection macroTokens = new contentMatchCollection();
            List <IContentToken>   output      = new List <IContentToken>();

            var flags = flag.getEnumListFromFlags();

            macroTokens.scrambled = sentence.content;

            foreach (tokenDetectionFlag fl in flags)
            {
                switch (fl)
                {
                case tokenDetectionFlag.emailAddress:
                    macroTokens.Add(_select_emailAddress, fl);
                    break;

                case tokenDetectionFlag.phonenumber:
                    macroTokens.Add(_select_phoneNumber, fl);
                    break;

                case tokenDetectionFlag.standard:
                    macroTokens.Add(_select_standards, fl);
                    break;
                }
            }

            foreach (tokenDetectionFlag fl in flags)
            {
                switch (fl)
                {
                case tokenDetectionFlag.yearNumber:
                    macroTokens.Add(_select_yearNumber, fl);
                    break;

                case tokenDetectionFlag.postOfficeNumber:
                    // note: this reuses the phone-number pattern; a dedicated post-office-number selector may have been intended
                    macroTokens.Add(_select_phoneNumber, fl);
                    break;
                }
            }

            foreach (tokenDetectionFlag fl in flags)
            {
                switch (fl)
                {
                case tokenDetectionFlag.acronims:
                    macroTokens.Add(_select_acronimIrregular, fl);
                    macroTokens.Add(_select_acronimByLength, fl);
                    break;
                }
            }

            // logSystem.log("    -- setTokensForSentence: quering performed "+ macroTokens.Count   , logType.Notification);

            if (flags.Contains(tokenDetectionFlag.standardDetection))
            {
                macroTokens.Add(_select_tokenWithSplitter, tokenDetectionFlag.none);
            }

            Int32 i  = 0;
            Int32 mx = sentence.content.Length;

            while (i < mx)
            {
                try
                {
                    #region LOOP

                    oneOrMore <contentMatch> cms = macroTokens.allocated(i, 1);

                    if (cms == null)
                    {
                        i = mx;
                        continue;
                    }

                    if (cms.isNothing)
                    {
                        i++;
                        continue;
                    }
                    else
                    {
                        contentMatch cm = cms.First();

                        //
                        if (cm == null)
                        {
                            // logSystem.log("        -- -- -- cm is null " + cm.toStringSafe(), logType.Notification);
                            i++;
                            continue;
                        }
                        else
                        {
                            //        logSystem.log("        -- -- -- cm found " + cm.toStringSafe(), logType.Notification);
                        }

                        i = i + cm.match.Length;

                        T      newToken = new T();
                        String mch      = cm.match.Value.Trim("#".ToCharArray());
                        newToken.sourceContent = mch;
                        newToken.content       = mch;

                        Match sp = _select_tokenWithSplitter.Match(mch);
                        if (sp.Success)
                        {
                            newToken.spliter = sp.Value;
                            newToken.content = newToken.content.removeEndsWith(newToken.spliter);
                        }
                        else
                        {
                            newToken.spliter = "";
                            newToken.content = mch;
                        }

                        IContentSentence _sentence = sentence;

                        if (setTokenToSentence)
                        {
                            if (subsentenceMatches != null)
                            {
                                if (subsentenceMatches.isAllocated(cm.match.Index, cm.match.Length))
                                {
                                    oneOrMore <contentMatch> subcms = subsentenceMatches.allocated(cm.match.Index,
                                                                                                   cm.match.Length);

                                    contentMatch subcm = subcms.First();
                                    if (subcm == null)
                                    {
                                        // logSystem.log("    -- -- -- sub cm is null  ", logType.Notification);
                                    }
                                    else
                                    {
                                        _sentence = subcm.element as IContentSubSentence;
                                    }
                                }
                            }
                            if (_sentence != null)
                            {
                                _sentence.setItem(newToken);
                                if (_sentence == sentence)
                                {
                                    output.Add(newToken);
                                }
                            }
                            else
                            {
                                logSystem.log("    -- -- -- _sentence is null  ", logType.Notification);
                            }
                        }
                        else
                        {
                            output.Add(newToken);
                        }

                        if (cm.associatedKey == null)
                        {
                            logSystem.log("    -- -- -- cm.associatedKey  is null ", logType.Notification);
                        }
                        else
                        {
                            tokenDetectionFlag fl = tokenDetectionFlag.none;

                            Boolean detected = Enum.TryParse(cm.associatedKey.toStringSafe(), true, out fl);

                            newToken.detectionFlags = fl;
                        }

                        cm.element = newToken;
                    }

                    #endregion LOOP
                }
                catch (Exception ex)
                {
                    var isb = new StringBuilder();
                    isb.AppendLine("loop error");
                    isb.AppendLine("Target is: i=" + i + "[mx=" + mx + "]");
                    // the exception is swallowed and the scan continues; if it occurred before i advanced,
                    // the loop may not terminate (the _setTokensForSentence variant below adds an sI/sLimit guard)
                    //devNoteManager.note(ex, isb.ToString(), "loop error", devNoteType.tokenization);
                }
            }
            return(output);
        }
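The scanning loop above slides a character index across sentence.content and, whenever the macro-token collection has a match allocated at that position, consumes the whole match and jumps past it. Below is a self-contained, simplified sketch of that scan pattern using only standard .NET regular expressions; it illustrates the idea and is not the library's contentMatchCollection.

using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;

internal static class allocationScanSketch
{
    // Simplified stand-in for the allocation scan in setTokensForSentence:
    // patterns are matched up front, then one index walks the text and
    // consumes whichever match starts at the current position (first pattern wins).
    internal static List<string> scan(string text, params Regex[] patterns)
    {
        var allocated = new Dictionary<int, Match>();   // start index -> match
        foreach (Regex rx in patterns)
        {
            foreach (Match m in rx.Matches(text))
            {
                if (!allocated.ContainsKey(m.Index))
                {
                    allocated[m.Index] = m;
                }
            }
        }

        var tokens = new List<string>();
        int i  = 0;
        int mx = text.Length;
        while (i < mx)
        {
            if (allocated.TryGetValue(i, out Match cm))
            {
                tokens.Add(cm.Value.Trim());
                i = i + cm.Length;      // consume the whole match, like i = i + cm.match.Length above
            }
            else
            {
                i++;                    // nothing allocated at this position, move one character forward
            }
        }
        return tokens;
    }

    private static void Main()
    {
        var words = new Regex(@"\w+[\s\p{P}]*");
        var email = new Regex(@"\S+@\S+\.\w+");
        // email is listed first, so it claims the overlapping position before the word pattern does
        foreach (string t in scan("Write to test@example.com, please.", email, words))
        {
            Console.WriteLine(t);
        }
    }
}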
Example #6
        /// <summary>
        /// Can build the macroTokens itself or receive them ready-made. Applies the subsentence algorithm and performs standard token detection --- most importantly, it places the tokens/subsentences into the parent
        /// </summary>
        /// <typeparam name="T">Token type to instantiate</typeparam>
        /// <param name="resources">Expects an optional contentMatchCollection of sub-sentence matches, an optional contentTokenCollection to fill and a tokenDetectionFlag value</param>
        /// <returns>Collection of detected tokens</returns>
        public virtual contentTokenCollection _setTokensForSentence <T>(params object[] resources)
            where T : IContentToken, new()
        {
            contentMatchCollection subsentenceMatches = resources.getFirstOfType <contentMatchCollection>();

            if (subsentenceMatches == null)
            {
                subsentenceMatches = new contentMatchCollection();
            }


            contentMatchCollection macroTokens = _setMacroTokensForSentence <T>(resources);   // note: the summary says ready-made macroTokens can be supplied, but they are always rebuilt here

            contentTokenCollection output = resources.getFirstOfType <contentTokenCollection>();

            if (output == null)
            {
                output = new contentTokenCollection();
            }

            tokenDetectionFlag flags = resources.getFirstOfType <tokenDetectionFlag>();

            if (flags.HasFlag(tokenDetectionFlag.standardDetection))
            {
                macroTokens.Add(_select_tokenWithSplitter, tokenDetectionFlag.standardDetection);
            }


            string source = content;

            int      i            = 0;
            int      mx           = source.Length;
            int      sI           = 0;
            int      sLimit       = mx;
            DateTime processStart = DateTime.Now;

            while (i < mx)
            {
                try
                {
                    if (sI > sLimit)
                    {
                        aceLog.log("Content sentence tokenization broken");
                        break;
                    }
                    sI++;
                    #region LOOP

                    oneOrMore <contentMatch> cms = macroTokens.allocated(i, 1);

                    if (cms == null)
                    {
                        i = mx;
                        continue;
                    }

                    if (cms.isNothing)
                    {
                        i++;
                        continue;
                    }
                    else
                    {
                        contentMatch cm = cms.First();
                        if (cm == null)
                        {
                            i++;
                            continue;
                        }

                        i = i + cm.match.Length;

                        IContentToken newToken = new T();
                        string        mch      = cm.match.Value.Trim("#".ToCharArray());
                        newToken.sourceContent = mch;
                        newToken.content       = mch;


                        Match sp = _select_tokenSplitter.Match(mch);
                        if (sp.Success)
                        {
                            newToken.spliter = sp.Value;
                            newToken.content = newToken.content.removeEndsWith(newToken.spliter).Trim();
                        }
                        else
                        {
                            //if (cm.match.Groups.Count > 1)
                            //{
                            //    mch = cm.match.Groups[1].Value;
                            //}
                            //else
                            //{

                            //}
                            newToken.spliter = "";
                            newToken.content = mch.Trim();
                        }


                        if (DateTime.Now.Subtract(processStart).Minutes > 2)
                        {
                            aceLog.log("TOKENIZATION TIME LIMIT BROKEN !!!");
                            break;
                        }


                        IContentSentence _sentence = this;

                        if (cm.element is IContentSubSentence)
                        {
                            IContentSubSentence sub = cm.element as IContentSubSentence;
                            sub.masterToken = newToken;
                            newToken        = (IContentToken)cm.element;
                        }

                        /*
                         * if (subsentenceMatches.isAllocated(cm.match.Index, cm.match.Length))
                         * {
                         *  oneOrMore<contentMatch> subcms = subsentenceMatches.allocated(cm.match.Index, cm.match.Length);
                         *  contentMatch subcm = subcms.First();
                         *  if (subcm == null)
                         *  {
                         *      // logSystem.log("    -- -- -- sub cm is null  ", logType.Notification);
                         *  }
                         *  else
                         *  {
                         *      if (subcm.element != null)
                         *      {
                         *
                         *      }
                         *      _sentence = subcm.element as IContentSubSentence;
                         *
                         *      if (_sentence != null)
                         *      {
                         *          IContentSubSentence _subSentence = _sentence as IContentSubSentence;
                         *          newToken.flags.Add(contentTokenFlag.subsentence_inner);
                         *          _subSentence.masterToken = newToken;
                         *          newToken = (T)(_subSentence as IContentToken);
                         *
                         *
                         *
                         *
                         *          //_sentence.setItem(newToken);
                         *
                         *          //if (output.Contains(_sentence as IContentToken))
                         *          //{
                         *
                         *          //}
                         *          //else
                         *          //{
                         *
                         *          //     output.Add(_sentence as IContentToken);
                         *          //}
                         *
                         *      }
                         *      else
                         *      {
                         *         // output.Add(newToken);
                         *      }
                         *  }
                         *
                         *
                         * }*/



                        /*
                         * if (_sentence != null)
                         * {
                         *
                         *
                         *  //setItem(_sentence);
                         *
                         *
                         *  if (_sentence == this)
                         *  {
                         *
                         *      output.Add(newToken);
                         *  } else
                         *  {
                         *      setItem(newToken);
                         *      if (output.Contains(_sentence as IContentToken))
                         *      {
                         *
                         *      }
                         *      else
                         *      {
                         *         // output.Add(_sentence as IContentToken);
                         *      }
                         *
                         *  }
                         *
                         * }
                         * else
                         * {
                         *  setItem(newToken);
                         * }
                         */
                        if (cm.associatedKey != null)
                        {
                            tokenDetectionFlag fl = tokenDetectionFlag.none;
                            bool detected         = Enum.TryParse(cm.associatedKey.toStringSafe(), true, out fl);
                            newToken.detectionFlags = fl;
                        }
                        // add the token unless it is already collected or it is this sentence itself
                        if (!output.Contains(newToken) && !ReferenceEquals(newToken, this))
                        {
                            output.Add(newToken);
                        }
                    }

                    #endregion
                }
                catch (Exception ex)
                {
                    var isb = new StringBuilder();
                    isb.AppendLine("loop error error");
                    isb.AppendLine("Target is: i=" + i + "[mx=" + mx + "]");
                    throw new aceGeneralException(isb.ToString(), null, this, "Loop");
                    // devNoteManager.note(ex, isb.ToString(), "loop error", devNoteType.tokenization);
                }
            }


            return(output);
        }
Example #7
        /// <summary>
        /// Main method for processing the content of a single sentence >> first calls setSubSentences, then setTokensForSentence
        /// </summary>
        /// <typeparam name="T">Token type to instantiate</typeparam>
        /// <typeparam name="TS">Sub-sentence type to instantiate</typeparam>
        /// <param name="resources">tokenDetectionFlag flags, contentTokenCollection contentTokenCollections</param>
        /// <returns>Collection of detected tokens</returns>
        public virtual contentTokenCollection setTokensFromContent <T, TS>(params object[] resources)
            where T : class, IContentToken, new()
            where TS : IContentSubSentence, new()
        {
            //logSystem.log("set tokens from content Sentence: " + sentence.content, logType.Notification);
            IContentSentence sentence = this;

            tokenDetectionFlag detection_flags = resources.getFirstOfType <tokenDetectionFlag>(); // new tokenDetectionFlags();

            contentTokenCollection tokenCollections = resources.getFirstOfType <contentTokenCollection>();

            if (tokenCollections == null)
            {
                tokenCollections = new contentTokenCollection();
            }

            contentMatchCollection subsentenceMatches = _setSubSentences <TS>(detection_flags, null);

            try
            {
                int subCount = 0;
                for (int dti = 0; dti < subsentenceMatches.Count; dti++)
                {
                    contentMatch dt = subsentenceMatches[subsentenceMatches.Keys.imbGetItemAt(dti).ToString()]; // subsentenceMatches[dti];

                    contentSubSentence ss = dt.element as contentSubSentence;

                    contentTokenCollection subtkns = new contentTokenCollection();
                    //var cs = ss._setTokensForSentence<T>(subtkns, subsentenceMatches, flags);
                    var cs = ss._setTokensForSentence <T>(subtkns, detection_flags);
                    //var cs = ss._setTokensForSentence<T>(tokenCollections, flags);
                    //var cs = tokenCollectionsss._set
                    //var cs = ss._setTokensForSentence<T>(flags);
                    for (int ci = 0; ci < cs.Count; ci++)
                    {
                        ss.setItem(cs[ci]);
                    }

                    //cs = ss._setTokensForSentence<T>(subtkns, subsentenceMatches);
                    // ss.items.AddRange(cs);

                    //  contentTokenCollection subtkns = ss.setTokensFromContent<T>(resources);

                    //ss.items.Add(ss);
                    //foreach (T sst in ss.items)
                    //{

                    //    tokenCollections.Add(sst);
                    //}
                    //tokenCollections.Add(ss);
                    //dt.element = ss;

                    //  subCount++;
                }

                List <IContentToken> directTokens = new List <IContentToken>();

                directTokens = _setTokensForSentence <T>(subsentenceMatches, detection_flags, tokenCollections, directTokens);

                if (directTokens != tokenCollections)
                {
                    for (int dti = 0; dti < directTokens.Count; dti++)
                    {
                        IContentToken dt = directTokens[dti];

                        T tkn = dt as T;
                        if (tkn != null)
                        {
                            tokenCollections.Add(tkn);
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                var isb = new StringBuilder();
                isb.AppendLine("tokenDetection error");
                isb.AppendLine("Target is: " + sentence.toStringSafe());
                throw;
                // devNoteManager.note(sentence, ex, isb.ToString(), "tokenDetection", devNoteType.tokenization);
            }

            foreach (var tk in tokenCollections)
            {
                //subsentenceMatches.allocated(tk.)
                setItem(tk);
            }

            // logSystem.log("set tokens from content Sentence done", logType.Notification);
            return(tokenCollections);
        }
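A hedged call sketch, not in the original: contentToken and contentSubSentence are assumed implementations, and sentence is an instance of the class that declares setTokensFromContent<T, TS>; tokenDetectionFlag.standardDetection is taken from the code above, and extra flags or a pre-existing contentTokenCollection could be passed in the same params array.

        // hedged sketch - contentToken/contentSubSentence are assumed implementations
        contentTokenCollection tokens = sentence.setTokensFromContent<contentToken, contentSubSentence>(
            tokenDetectionFlag.standardDetection);
        // sub-sentences are detected first, then tokens; everything ends up attached to the sentence via setItem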
        /// <summary>
        /// Returns the sub-sentences for the supplied sentence. sentence.content will receive a scrambled version - with everything removed that is not
        /// </summary>
        /// <typeparam name="T">Sub-sentence type to instantiate</typeparam>
        /// <param name="sentence">Sentence to scan for sub-sentences</param>
        /// <param name="_subflags">Sub-sentence detection flags to apply</param>
        /// <returns>Collection of sub-sentence matches, each carrying its sub-sentence in element</returns>
        public static contentMatchCollection setSubSentences <T>(IContentSentence sentence, subsentenceDetectionFlag _subflags) where T : IContentSubSentence, new()
        {
            // List<T> output = new List<T>();
            // logSystem.log("-- set sub sentences for: " + sentence.content, logType.Notification);
            contentMatchCollection subsentenceMatches = new contentMatchCollection();

            String scrambled = sentence.content;

            subsentenceMatches.scrambled = scrambled;

            var subflags = _subflags.getEnumListFromFlags();

            foreach (subsentenceDetectionFlag fl in subflags)
            {
                switch (fl)
                {
                case subsentenceDetectionFlag.enbracedSubSentences:
                    subsentenceMatches.Add(_select_enbracedSubSentence, fl);
                    break;

                case subsentenceDetectionFlag.enumerationSubSentences:
                    subsentenceMatches.Add(_select_enumerationSubSentence, fl);
                    break;

                case subsentenceDetectionFlag.quotationSubSentences:
                    subsentenceMatches.Add(_select_quotedSubSentence, fl);
                    break;
                }
            }

            //foreach (subsentenceDetectionFlag fl in subflags)
            //{
            //    switch (fl)
            //    {
            //        case subsentenceDetectionFlag.potentialPersonalNames:
            //            subsentenceMatches.Add(_select_potentialPersonalNames, fl);
            //            break;
            //        case subsentenceDetectionFlag.cityAndPostnumber:
            //            subsentenceMatches.Add(_select_potentialCityAndPost, fl);
            //            break;

            //    }

            //}

            foreach (subsentenceDetectionFlag fl in subflags)
            {
                switch (fl)
                {
                case subsentenceDetectionFlag.punctationSubSentences:
                    subsentenceMatches.Add(_select_innerSentence, fl);
                    break;
                }
            }

            foreach (contentMatch cm in subsentenceMatches.Values)
            {
                T subsentence = new T();
                subsentence.parent        = sentence;
                subsentence.sourceContent = cm.match.Value;
                subsentence.content       = cm.match.Value;

                //subsentence.detectionFlags.Add((subsentenceDetectionFlag)cm.associatedKey);

                //switch ((subsentenceDetectionFlag) cm.associatedKey)
                //{
                //    case subsentenceDetectionFlag.enbracedSubSentences:
                //        subsentence.flags.Add(contentTokenFlag.subsentence_inner);
                //        break;
                //    case subsentenceDetectionFlag.enumerationSubSentences:
                //        subsentence.flags.Add(contentTokenFlag.subsentence_enumeration);
                //        break;
                //    case subsentenceDetectionFlag.quotationSubSentences:
                //        subsentence.flags.Add(contentTokenFlag.subsentence_quoted);
                //        break;
                //    case subsentenceDetectionFlag.cityAndPostnumber:
                //        subsentence.flags.Add(contentTokenFlag.subsentence_information);
                //        break;
                //    case subsentenceDetectionFlag.punctationSubSentences:
                //        subsentence.flags.Add(contentTokenFlag.subsentence_inner);
                //        break;
                //    case subsentenceDetectionFlag.potentialPersonalNames:
                //        subsentence.flags.Add(contentTokenFlag.subsentence_information);
                //        break;

                //}

                cm.element = subsentence;
            }

            sentence.content = scrambled;
            // logSystem.log("-- set sub sentences done: ", logType.Notification);
            return(subsentenceMatches);
        }
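A hedged usage sketch, not in the original: contentSubSentence is an assumed implementation of IContentSubSentence, sentence is an existing IContentSentence, the call is made from inside the declaring class, and the flag members combined below appear in the switch above (this assumes subsentenceDetectionFlag is a [Flags] enum, which getEnumListFromFlags suggests).

        // hedged sketch - contentSubSentence is an assumed implementation
        contentMatchCollection matches = setSubSentences<contentSubSentence>(
            sentence,
            subsentenceDetectionFlag.enbracedSubSentences | subsentenceDetectionFlag.quotationSubSentences);

        foreach (contentMatch cm in matches.Values)
        {
            IContentSubSentence sub = cm.element as IContentSubSentence;   // each match now carries its sub-sentence
        }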