コード例 #1
0
ファイル: nlpTokenizator.cs プロジェクト: gorangrubic/imbNLP
        //public static void prepareContentToTokenize()


        /// <summary>
        /// Universal entry point for tokenization. The tokenizer is chosen from
        /// <c>settings.tknType</c>; at present only
        /// <c>tokenizationType.textTokenization</c> has a branch.
        /// </summary>
        /// <typeparam name="T">
        /// Type of the content to tokenize. The text branch reads it through
        /// <c>content as string</c>, so content of any other type is handed to the
        /// tokenizer as <c>null</c>.
        /// </typeparam>
        /// <param name="content">Content to tokenize.</param>
        /// <param name="settings">
        /// Tokenizer settings; supplies <c>tknType</c> and <c>doBlockDetection</c>.
        /// It is dereferenced without a null check.
        /// </param>
        /// <returns>
        /// The result of the selected tokenizer, or <c>null</c> when
        /// <c>settings.tknType</c> matches no branch.
        /// </returns>
        public static IContentPage tokenizeContent <T>(T content, nlpTokenizatorSettings settings)
        {
            // NOTE(review): an HTML tokenization branch used to live here but was
            // already disabled (commented out); only plain text is dispatched.
            switch (settings.tknType)
            {
            case tokenizationType.textTokenization:
                var textTkn = new plainTextTokenizator(settings);

                // "as" yields null when T is not string; that null is forwarded
                // unchanged, exactly as before.
                return textTkn.tokenizeContent(content as string, settings.doBlockDetection);
            }

            // No tokenizer is registered for the requested tokenization type.
            return null;
        }
コード例 #2
0
 /// <summary>
 /// Creates a <c>fakeTokenizator</c>; the settings are forwarded unchanged to
 /// the base-class constructor and no further initialization is performed here.
 /// </summary>
 /// <param name="__settings">Tokenizer settings passed on to the base class.</param>
 public fakeTokenizator(nlpTokenizatorSettings __settings)
     : base(__settings)
 {
     // Intentionally empty: all setup happens in the base constructor.
 }
コード例 #3
0
 /// <summary>
 /// Creates a <c>plainTextTokenizator</c>; the settings are forwarded unchanged
 /// to the base-class constructor and no further initialization is performed here.
 /// </summary>
 /// <param name="__settings">Tokenizer settings passed on to the base class.</param>
 public plainTextTokenizator(nlpTokenizatorSettings __settings)
     : base(__settings)
 {
     // Intentionally empty: all setup happens in the base constructor.
 }
コード例 #4
0
 /// <summary>
 /// Creates the tokenizer base and stores the supplied settings in
 /// <c>settings</c>.
 /// </summary>
 /// <param name="__settings">Tokenizer settings to keep on this instance.</param>
 public tokenizatorBase(nlpTokenizatorSettings __settings)
 {
     this.settings = __settings;
 }