Example #1
        /// <summary>
        /// MAIN TOKENIZATION ENTRY POINT - for the paragraph content it creates sentences, sub-sentences and tokens. Registers the tokens and sentences in the IContentPage output if one is supplied
        /// </summary>
        /// <typeparam name="TSentence">Sentence type</typeparam>
        /// <typeparam name="TSubSentence">Sub-sentence type</typeparam>
        /// <typeparam name="TToken">Token type</typeparam>
        /// <param name="resources">IContentPage for content registration; basicLanguage; paragraphDetectionFlag; sentenceDetectionFlag; contentPreprocessFlag; tokenDetectionFlag</param>
        public virtual void setParagraphFromContent <TSentence, TSubSentence, TToken>(params object[] resources)
            where TSentence : IContentSentence, new()
            where TSubSentence : IContentSubSentence, new()
            where TToken : class, IContentToken, new()
        {
            IContentPage output = resources.getFirstOfType <IContentPage>();

            basicLanguage basicLanguages = resources.getFirstOfType <basicLanguage>();

            if (basicLanguages == null)
            {
                basicLanguages = new basicLanguage();
            }

            // IContentBlock block = resources.getOfType<IContentBlock>();

            paragraphDetectionFlag flags           = resources.getFirstOfType <paragraphDetectionFlag>();
            sentenceDetectionFlag  sentenceFlags   = resources.getFirstOfType <sentenceDetectionFlag>();
            contentPreprocessFlag  preprocessFlags = resources.getFirstOfType <contentPreprocessFlag>();
            //  subsentenceDetectionFlags subsentenceFlags = new subsentenceDetectionFlags(resources);
            tokenDetectionFlag tokenFlags = resources.getFirstOfType <tokenDetectionFlag>(); // new tokenDetectionFlags(resources);

            contentSentenceCollection snt = _setSentencesFromContent <TSentence>(sentenceFlags, preprocessFlags);

            // sentenceDetection._setSentencesFromContent<TSentence>(paragraph, sentenceFlags, preprocessFlags);

            foreach (TSentence sn in snt)
            {
                // sn._setTokensForSentence<TSubSentence>(sentenceFlags, tokenFlags);
                var tkns = sn.setTokensFromContent <TToken, TSubSentence>(flags, sentenceFlags, preprocessFlags,
                                                                          tokenFlags, resources, basicLanguages);

                //tokenDetection.setTokensFromContent<TToken, TSubSentence>(sn, subsentenceFlags, tokenFlags);

                if (flags.HasFlag(paragraphDetectionFlag.dropSentenceWithNoToken))
                {
                    if (sn.items.Count == 0)
                    {
                        continue;
                    }
                }
                if (sentenceFlags.HasFlag(sentenceDetectionFlag.setSentenceToParagraph))
                {
                    setItem(sn);
                }

                //if (output != null)
                //{
                //    output.sentences.Add(sn);
                //    output.tokens.CollectAll(sn.items);
                //}
            }
        }
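
        // Usage sketch (illustrative only, not part of the original source): one way the entry
        // point above might be invoked. The concrete contentSentence / contentSubSentence /
        // contentToken classes and the "paragraph" and "outputPage" instances are assumptions
        // made for the example; the flag values are the ones the method body actually inspects.
        //
        //   paragraph.setParagraphFromContent<contentSentence, contentSubSentence, contentToken>(
        //       outputPage,                                       // optional IContentPage for registration
        //       paragraphDetectionFlag.dropSentenceWithNoToken,
        //       sentenceDetectionFlag.setSentenceToParagraph,
        //       contentPreprocessFlag.quoteStandardization,
        //       new basicLanguage());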
Example #2
        /// <summary>
        /// Splits the paragraph content into sentences and returns them as a collection
        /// </summary>
        /// <typeparam name="TSentence">Sentence type</typeparam>
        /// <param name="resources">sentenceDetectionFlag; contentPreprocessFlag; optional parent IContentPage</param>
        /// <returns>Collection of the detected sentences</returns>
        protected virtual contentSentenceCollection _setSentencesFromContent <TSentence>(params object[] resources)
            where TSentence : IContentSentence, new()
        {
            string input = content;

            sentenceDetectionFlag     flags           = resources.getFirstOfType <sentenceDetectionFlag>(); //new sentenceDetectionFlags(resources);
            contentPreprocessFlag     preprocessFlags = resources.getFirstOfType <contentPreprocessFlag>(); // new contentPreprocessFlags(resources);
            contentSentenceCollection output          = new contentSentenceCollection();

            // takes over the parent page if one was supplied
            IContentPage parentPage = resources.getFirstOfType <IContentPage>();


            if (flags.HasFlag(sentenceDetectionFlag.preprocessParagraphContent))
            {
                input = preprocess.process(input, preprocessFlags);
            }


            List <string> inputSentences = splitContentToSentences(input);

            foreach (string _inputSentece in inputSentences)
            {
                TSentence newSentence = new TSentence();
                newSentence.sourceContent = _inputSentece;
                newSentence.content       = _inputSentece;
                Match m = _select_sentenceTerminator.Match(_inputSentece);
                if (m.Success)
                {
                    newSentence.sentenceFlags |= contentSentenceFlag.regular;
                    newSentence.spliter = m.Value;
                    newSentence.content = _inputSentece.Substring(0,
                                                                  _inputSentece.Length - newSentence.spliter.Length);
                }
                else
                {
                    newSentence.sentenceFlags |= contentSentenceFlag.inregular;
                }
                output.Add(newSentence);
            }


            return(output);
        }
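
        // Illustrative note (assumes the _select_sentenceTerminator regex matches common
        // terminators such as '.', '!' and '?'): for the input "Prva recenica. Druga recenica!"
        // the method above would yield two sentences flagged contentSentenceFlag.regular, with
        // spliter "." and "!" and the terminator stripped from content; fragments without a
        // terminator are flagged contentSentenceFlag.inregular instead.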
        /// <summary>
        /// Sets tokens on the supplied sentence and returns a list of all detected tokens
        /// </summary>
        /// <typeparam name="T">Token type</typeparam>
        /// <typeparam name="TS">Sub-sentence type</typeparam>
        /// <param name="resources">IContentSentence; contentPreprocessFlag; subsentenceDetectionFlag; tokenDetectionFlag</param>
        /// <returns>List of all tokens found in the sentence and its sub-sentences</returns>
        internal static List <T> setTokensFromContent <T, TS>(params object[] resources)
            where T : class, IContentToken, new()
            where TS : IContentSubSentence, new()
        {
            //logSystem.log("set tokens from content Sentence: " + sentence.content, logType.Notification);
            IContentSentence         sentence        = resources.getFirstOfType <IContentSentence>();
            contentPreprocessFlag    preprocessFlags = resources.getFirstOfType <contentPreprocessFlag>();
            subsentenceDetectionFlag subflags        = resources.getFirstOfType <subsentenceDetectionFlag>();
            tokenDetectionFlag       flags           = resources.getFirstOfType <tokenDetectionFlag>();

            //tokenDetectionFlag[] _flags

            List <T> output = new List <T>();

            try
            {
                //subsentenceDetectionFlags subflags = _subflags;
                // tokenDetectionFlags flags = _flags;

                string pcontent = preprocess.process(sentence.content, preprocessFlags);

                contentMatchCollection subsentenceMatches = subsentenceDetection.setSubSentences <TS>(sentence, subflags);

                foreach (contentMatch dt in subsentenceMatches.Values)
                {
                    IContentSubSentence ss = dt.element as IContentSubSentence;
                    sentence.items.Add(ss);
                    foreach (T sst in ss.items)
                    {
                        output.Add(sst);
                    }
                    //output.AddRange(ss.items);
                }

                List <IContentToken> directTokens = setTokensForSentence <T>(sentence, true, flags, subsentenceMatches);

                if (directTokens != null)
                {
                    foreach (IContentToken dt in directTokens)
                    {
                        T tkn = dt as T;
                        if (tkn != null)
                        {
                            output.Add(tkn);
                        }
                    }
                }

                sentence.content = pcontent;
            }
            catch (Exception ex)
            {
                // isb is only consumed by the devNoteManager call below, which is currently disabled;
                // the bare throw preserves the original stack trace
                var isb = new StringBuilder();
                isb.AppendLine("tokenDetection error");
                isb.AppendLine("Target is: " + sentence.toStringSafe());
                throw;
                // devNoteManager.note(sentence, ex, isb.ToString(), "tokenDetection", devNoteType.tokenization);
            }

            // logSystem.log("set tokens from content Sentence done", logType.Notification);
            return(output);
        }
Example #4
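        /// <summary>
        /// Applies the selected preprocessing standardizations (quotes, acronyms, year ordinals, standards formatting, titles) to the content, in two passes
        /// </summary>
        /// <param name="content">Content to preprocess</param>
        /// <param name="_flags">Preprocessing flags to apply</param>
        /// <returns>The preprocessed content</returns>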
        public static string process(string content, contentPreprocessFlag _flags)
        {
            var flist = _flags.getEnumListFromFlags();

            if (string.IsNullOrEmpty(content))
            {
                return("");
            }

            string output = content;

            foreach (contentPreprocessFlag _flag in flist)
            {
                switch (_flag)
                {
                case contentPreprocessFlag.quoteStandardization:
                    output = output.Replace("„", "\"");
                    output = output.Replace(",,", "\"");
                    output = output.Replace("''", "\"");
                    output = output.Replace("``", "\"");
                    break;

                case contentPreprocessFlag.acronimStandardization:
                    output = _select_acronimWithDots.Replace(output, _replace_acronimWithDots);
                    output = _select_acronimWithDotsAndSpaces.Replace(output, _replace_acronimWithDots);
                    break;

                case contentPreprocessFlag.yearOrdinal:
                    output = _select_yearOrdinalInGramarCase.Replace(output, _replace_yearOrdinalInGramarCase);
                    break;

                case contentPreprocessFlag.enbraceStandardize:
                    // output = _select_enbraceAllTypes.Replace(output, _replace_enbraceAllTypes);
                    break;

                case contentPreprocessFlag.deentitize:
                    //output = output.imbHtmlDecode();
                    break;

                case contentPreprocessFlag.internationalStandardsFormat:
                    output = _select_standardsFormatting.Replace(output, _replace_standardsFormatting);
                    break;
                }
            }

            // second pass -- title standardization runs after the other normalizations
            foreach (contentPreprocessFlag _flag in flist)
            {
                // logSystem.log("Processing: " + _flag.ToString(), logType.Notification);
                switch (_flag)
                {
                case contentPreprocessFlag.titleStandardize:
                    output = _select_titleAllCapital.Replace(output, _replace_titleAllCapital);
                    output = _select_titleFirstCapital.Replace(output, _replace_titleAllCapital);
                    break;
                }
            }

            return(output);
        }
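
        // Usage sketch (illustrative only, not part of the original source): composing several
        // of the flags handled above in a single call, assuming contentPreprocessFlag is a
        // [Flags] enum that getEnumListFromFlags() decomposes into individual values.
        public static string exampleStandardize(string text)
        {
            // quote and acronym standardization run in the first pass, title fixing in the second
            return process(text, contentPreprocessFlag.quoteStandardization
                                 | contentPreprocessFlag.acronimStandardization
                                 | contentPreprocessFlag.titleStandardize);
        }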
Example #5
        //public static contentElementList tokenizeUrlAndTitle(String url, String title, String description="")
        //{
        //    contentElementList output = new contentStructure.collections.contentElementList();



        //}



        /// <summary>
        /// Creates sentences from an HtmlNode and returns the collection -- used both for main sentences and for sub-sentences
        /// </summary>
        /// <param name="htmlNode">The HTML node.</param>
        /// <param name="parent">The parent.</param>
        /// <param name="output">The output.</param>
        /// <param name="preprocessFlags">The preprocess flags.</param>
        /// <param name="flags">The flags.</param>
        /// <returns></returns>
        public static contentTokenCollection createSentencesFromNode(this HtmlNode htmlNode, IHtmlContentElement parent,
                                                                     contentTokenCollection output         = null,
                                                                     contentPreprocessFlag preprocessFlags = contentPreprocessFlag.none,
                                                                     sentenceDetectionFlag flags           = sentenceDetectionFlag.none)
        {
            if (output == null)
            {
                output = new contentTokenCollection();
            }
            // if (preprocessFlags == null) preprocessFlags = contentPreprocessFlags.getDefaultFlags();
            // if (flags == null) flags = sentenceDetectionFlags.getDefaultFlags();


            List <HtmlNode> nodes = new List <HtmlNode>();

            if (htmlNode.HasChildNodes)
            {
                foreach (HtmlNode child in htmlNode.ChildNodes)
                {
                    if (child.isNodeAcceptable())
                    {
                        nodes.Add(child);
                    }
                }
            }
            else
            {
                nodes.Add(htmlNode);
            }


            foreach (HtmlNode child in nodes)
            {
                HtmlNode relNode = child;
                if (child.ChildNodes.Count > 0)
                {
                    htmlContentSentence    htmlSentence = new htmlContentSentence(child, "");
                    contentTokenCollection subSentences = child.createSentencesFromNode(htmlSentence, null,
                                                                                        preprocessFlags, flags);
                    output.AddRange(subSentences);
                    output.Add(htmlSentence);
                    parent.setItem(htmlSentence);

                    //subSentences.ForEach(x=>htmlSentence.items.Add(x));
                }
                else
                {
                    //if (child.ChildNodes.Count == 1)
                    //{
                    //    relNode = child.FirstChild;
                    //}
                    //if (relNode.NodeType==HtmlNodeType.Text)
                    //{
                    //    relNode = relNode.ParentNode;
                    //}
                    string input = child.InnerText.Trim();


                    if (flags.HasFlag(sentenceDetectionFlag.preprocessParagraphContent))
                    {
                        input = preprocess.process(input, preprocessFlags);
                    }

                    List <string> inputSentences = splitContentToSentences(input);

                    foreach (string _inputSentece in inputSentences)
                    {
                        // skip empty fragments produced by the splitter
                        if (string.IsNullOrEmpty(_inputSentece))
                        {
                            continue;
                        }

                        htmlContentSentence newSentence = new htmlContentSentence(relNode, _inputSentece);
                        Match m = _select_sentenceTerminator.Match(_inputSentece);
                        if (m.Success)
                        {
                            newSentence.sentenceFlags |= contentSentenceFlag.regular;
                            newSentence.spliter = m.Value;
                            newSentence.content = _inputSentece.Substring(0,
                                                                          _inputSentece.Length -
                                                                          newSentence.spliter.Length);
                            newSentence.content = newSentence.content.Trim();
                        }
                        else
                        {
                            newSentence.sentenceFlags |= contentSentenceFlag.inregular;
                        }
                        output.Add(newSentence);
                        parent.setItem(newSentence);
                    }
                }
            }


            return(output);
        }
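
        // Usage sketch (illustrative only, not part of the original source): feeding an
        // HtmlAgilityPack document through the extension above. Using a throw-away
        // htmlContentSentence as the root parent is an assumption made to keep the sample
        // self-contained; the real pipeline would pass the page element being built.
        public static contentTokenCollection exampleSentencesFromHtml(string html)
        {
            var doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(html);

            // root element that will receive the detected sentences via setItem()
            var rootParent = new htmlContentSentence(doc.DocumentNode, "");

            return doc.DocumentNode.createSentencesFromNode(
                rootParent,
                null,
                contentPreprocessFlag.quoteStandardization,
                sentenceDetectionFlag.preprocessParagraphContent);
        }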