/// <summary>
 /// Initializes a new instance of the <see cref="htmlLinkNodeCollection"/> class.
 /// </summary>
 /// <param name="tkns">The TKNS.</param>
 public htmlLinkNodeCollection(contentTokenCollection tkns)
 {
     foreach (IHtmlContentElement tkn in tkns)
     {
         if (!scoped.Contains(tkn))
         {
             var lnk = tkn.linkRootParent;
             if (lnk != null)
             {
                 Add(lnk);
             }
         }
     }
 }
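
A minimal usage sketch for the constructor above. It is hypothetical: it assumes the token collection is exposed on a parsed sentence as `items` and that `htmlLinkNodeCollection` is enumerable, neither of which is shown in this listing.

// Hypothetical usage: gather the distinct link root parents of a sentence's tokens.
// `parsedSentence.items` is assumed here to expose a contentTokenCollection.
contentTokenCollection tokens = parsedSentence.items;
htmlLinkNodeCollection links = new htmlLinkNodeCollection(tokens);

foreach (var link in links)
{
    Console.WriteLine(link);   // one entry per non-scoped token that reported a linkRootParent
}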
Example No. 2
 public contentSentence()
 {
     items = new contentTokenCollection();
 }
Example No. 3
        /// <summary>
        /// Main method for processing the content of a single sentence >> first calls setSubSentences, then setTokensForSentence
        /// </summary>
        /// <typeparam name="T"></typeparam>
        /// <typeparam name="TS"></typeparam>
        /// <param name="resources"> tokenDetectionFlags flags, contentTokenCollection contentTokenCollections</param>
        /// <returns></returns>
        public virtual contentTokenCollection setTokensFromContent <T, TS>(params object[] resources)
            where T : class, IContentToken, new()
            where TS : IContentSubSentence, new()
        {
            //logSystem.log("set tokens from content Sentence: " + sentence.content, logType.Notification);
            IContentSentence sentence = this;

            tokenDetectionFlag detection_flags = resources.getFirstOfType <tokenDetectionFlag>(); // new tokenDetectionFlags();

            contentTokenCollection tokenCollections = resources.getFirstOfType <contentTokenCollection>();

            if (tokenCollections == null)
            {
                tokenCollections = new contentTokenCollection();
            }

            contentMatchCollection subsentenceMatches = _setSubSentences <TS>(detection_flags, null);

            try
            {
                for (int dti = 0; dti < subsentenceMatches.Count; dti++)
                {
                    contentMatch dt = subsentenceMatches[subsentenceMatches.Keys.imbGetItemAt(dti).ToString()];

                    contentSubSentence ss = dt.element as contentSubSentence;

                    // tokenize each detected subsentence into its own collection and attach the tokens to it
                    contentTokenCollection subtkns = new contentTokenCollection();
                    var cs = ss._setTokensForSentence <T>(subtkns, detection_flags);
                    for (int ci = 0; ci < cs.Count; ci++)
                    {
                        ss.setItem(cs[ci]);
                    }
                }

                List <IContentToken> directTokens = new List <IContentToken>();

                directTokens = _setTokensForSentence <T>(subsentenceMatches, detection_flags, tokenCollections, directTokens);

                if (directTokens != tokenCollections)
                {
                    for (int dti = 0; dti < directTokens.Count; dti++)
                    {
                        IContentToken dt = directTokens[dti];

                        T tkn = dt as T;
                        if (tkn != null)
                        {
                            tokenCollections.Add(tkn);
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                var isb = new StringBuilder();
                isb.AppendLine("tokenDetection error");
                isb.AppendLine("Target is: " + sentence.toStringSafe());
                isb.AppendLine(ex.ToString());
                aceLog.log(isb.ToString());
                throw;
            }

            // place every detected token into this sentence
            foreach (var tk in tokenCollections)
            {
                setItem(tk);
            }

            // logSystem.log("set tokens from content Sentence done", logType.Notification);
            return(tokenCollections);
        }
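
A hedged call sketch for the tokenizer above. `contentToken` is assumed to be a concrete `IContentToken` implementation in this codebase (it is not shown here); `contentSubSentence` does appear in the method body. The flag and collection arguments are matched by type inside the method, so their order does not matter.

// Hypothetical usage: tokenize one sentence into tokens and subsentences.
contentSentence sentence = new contentSentence();
sentence.content = "An example sentence, with a sub-clause (like this one) inside.";   // assumes content is settable here

contentTokenCollection tokens = sentence.setTokensFromContent<contentToken, contentSubSentence>(
    tokenDetectionFlag.standardDetection,
    new contentTokenCollection());

foreach (IContentToken token in tokens)
{
    Console.WriteLine(token.content + " | splitter: '" + token.spliter + "'");
}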
Example No. 4
        /// <summary>
        /// Can run macroTokens detection itself or receive ready-made results. Applies the subsentence algorithm and performs standard token detection --- most importantly, it places the tokens/subsentences into the parent
        /// </summary>
        /// <typeparam name="T"></typeparam>
        /// <param name="resources"></param>
        /// <returns></returns>
        public virtual contentTokenCollection _setTokensForSentence <T>(params object[] resources)
            where T : IContentToken, new()
        {
            contentMatchCollection subsentenceMatches = resources.getFirstOfType <contentMatchCollection>();

            if (subsentenceMatches == null)
            {
                subsentenceMatches = new contentMatchCollection();
            }


            contentMatchCollection macroTokens = _setMacroTokensForSentence <T>(resources);

            contentTokenCollection output = resources.getFirstOfType <contentTokenCollection>();

            if (output == null)
            {
                output = new contentTokenCollection();
            }

            tokenDetectionFlag flags = resources.getFirstOfType <tokenDetectionFlag>();

            if (flags.HasFlag(tokenDetectionFlag.standardDetection))
            {
                macroTokens.Add(_select_tokenWithSplitter, tokenDetectionFlag.standardDetection);
            }


            string source = "";

            source = content;

            int      i            = 0;
            int      mx           = source.Length;
            int      sI           = 0;
            int      sLimit       = mx;
            DateTime processStart = DateTime.Now;

            while (i < mx)
            {
                try
                {
                    if (sI > sLimit)
                    {
                        aceLog.log("Content sentence tokenization broken");
                        break;
                    }
                    sI++;
                    #region LOOP

                    oneOrMore <contentMatch> cms = macroTokens.allocated(i, 1);

                    if (cms == null)
                    {
                        i = mx;
                        continue;
                    }

                    if (cms.isNothing)
                    {
                        i++;
                        continue;
                    }
                    else
                    {
                        contentMatch cm = cms.First();
                        if (cm == null)
                        {
                            i++;
                            continue;
                        }

                        i = i + cm.match.Length;

                        IContentToken newToken = new T();
                        string        mch      = cm.match.Value.Trim("#".ToCharArray());
                        newToken.sourceContent = mch;
                        newToken.content       = mch;


                        Match sp = _select_tokenSplitter.Match(mch);
                        if (sp.Success)
                        {
                            newToken.spliter = sp.Value;
                            newToken.content = newToken.content.removeEndsWith(newToken.spliter).Trim();
                        }
                        else
                        {
                            newToken.spliter = "";
                            newToken.content = mch.Trim();
                        }


                        if (DateTime.Now.Subtract(processStart).TotalMinutes > 2)
                        {
                            aceLog.log("TOKENIZATION TIME LIMIT BROKEN !!!");
                            break;
                        }


                        IContentSentence _sentence = this;

                        if (cm.element is IContentSubSentence)
                        {
                            IContentSubSentence sub = cm.element as IContentSubSentence;
                            sub.masterToken = newToken;
                            newToken        = (IContentToken)cm.element;
                        }

                        if (cm.associatedKey != null)
                        {
                            tokenDetectionFlag fl = tokenDetectionFlag.none;
                            Enum.TryParse(cm.associatedKey.toStringSafe(), true, out fl);
                            newToken.detectionFlags = fl;
                        }

                        // add the token unless it is already collected or is the sentence itself
                        if (!output.Contains(newToken) && (newToken != this))
                        {
                            output.Add(newToken);
                        }
                    }

                    #endregion
                }
                catch (Exception ex)
                {
                    var isb = new StringBuilder();
                    isb.AppendLine("loop error");
                    isb.AppendLine("Target is: i=" + i + "[mx=" + mx + "]");
                    isb.AppendLine(ex.ToString());
                    throw new aceGeneralException(isb.ToString(), null, this, "Loop");
                }
            }


            return(output);
        }
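
The method reads its inputs from the `params object[]` by type (via `getFirstOfType`), so a call only needs to supply whichever resources it has, in any order. A hedged sketch follows, with `contentToken` again assumed to be a concrete token type from this codebase.

// Hypothetical direct call: each resource is looked up by type, so argument order is irrelevant.
contentSentence sentence = new contentSentence();
sentence.content = "Another example sentence.";   // assumes content is settable, as above

contentTokenCollection output = new contentTokenCollection();
contentMatchCollection subsentenceMatches = new contentMatchCollection();

sentence._setTokensForSentence<contentToken>(
    subsentenceMatches,                      // pre-computed subsentence matches (optional)
    tokenDetectionFlag.standardDetection,    // detection flags
    output);                                 // collection that receives the detected tokens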
Example No. 5



        /// <summary>
        /// Builds sentences from an HtmlNode and returns the collection -- used for main sentences as well as for subsentences
        /// </summary>
        /// <param name="htmlNode">The HTML node.</param>
        /// <param name="parent">The parent.</param>
        /// <param name="output">The output.</param>
        /// <param name="preprocessFlags">The preprocess flags.</param>
        /// <param name="flags">The flags.</param>
        /// <returns>The collection of detected sentences.</returns>
        public static contentTokenCollection createSentencesFromNode(this HtmlNode htmlNode, IHtmlContentElement parent,
                                                                     contentTokenCollection output         = null,
                                                                     contentPreprocessFlag preprocessFlags = contentPreprocessFlag.none,
                                                                     sentenceDetectionFlag flags           = sentenceDetectionFlag.none)
        {
            if (output == null)
            {
                output = new contentTokenCollection();
            }


            List <HtmlNode> nodes = new List <HtmlNode>();

            if (htmlNode.HasChildNodes)
            {
                foreach (HtmlNode child in htmlNode.ChildNodes)
                {
                    if (child.isNodeAcceptable())
                    {
                        nodes.Add(child);
                    }
                }
            }
            else
            {
                nodes.Add(htmlNode);
            }


            foreach (HtmlNode child in nodes)
            {
                HtmlNode relNode = child;
                if (child.ChildNodes.Count > 0)
                {
                    htmlContentSentence    htmlSentence = new htmlContentSentence(child, "");
                    contentTokenCollection subSentences = child.createSentencesFromNode(htmlSentence, null,
                                                                                        preprocessFlags, flags);
                    output.AddRange(subSentences);
                    output.Add(htmlSentence);
                    parent.setItem(htmlSentence);

                }
                else
                {
                    string input = child.InnerText.Trim();


                    if (flags.HasFlag(sentenceDetectionFlag.preprocessParagraphContent))
                    {
                        input = preprocess.process(input, preprocessFlags);
                    }

                    List <string> inputSentences = splitContentToSentences(input);

                    foreach (string inputSentence in inputSentences)
                    {
                        if (string.IsNullOrEmpty(inputSentence))
                        {
                            continue;
                        }

                        htmlContentSentence newSentence = new htmlContentSentence(relNode, inputSentence);
                        if (_select_sentenceTerminator.IsMatch(inputSentence))
                        {
                            newSentence.sentenceFlags |= contentSentenceFlag.regular;
                            Match m = _select_sentenceTerminator.Match(inputSentence);
                            if (m.Success)
                            {
                                newSentence.spliter = m.Value;
                                newSentence.content = inputSentence.Substring(0, inputSentence.Length - newSentence.spliter.Length);
                                newSentence.content = newSentence.content.Trim();
                            }
                        }
                        else
                        {
                            newSentence.sentenceFlags |= contentSentenceFlag.inregular;
                        }
                        output.Add(newSentence);
                        parent.setItem(newSentence);
                    }
                }
            }


            return(output);
        }
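
A hedged usage sketch for the extension method above, assuming HtmlAgilityPack's `HtmlDocument` for parsing. `htmlContentSentence` is used as the `IHtmlContentElement` parent, matching how the method passes it in its own recursive call; the sample markup is illustrative only.

// Hypothetical usage: split an HTML fragment into sentence tokens.
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml("<div><p>First sentence. Second one!</p><p>A third paragraph without a terminator</p></div>");

IHtmlContentElement parent = new htmlContentSentence(doc.DocumentNode, "");
contentTokenCollection sentences = doc.DocumentNode.createSentencesFromNode(
    parent,
    flags: sentenceDetectionFlag.preprocessParagraphContent);

foreach (var s in sentences)
{
    Console.WriteLine(s);
}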