//public static void prepareContentToTokenize()

/// <summary>
/// Universal entry point for tokenization. The tokenizer is selected by
/// <c>settings.tknType</c>; at present only <c>tokenizationType.textTokenization</c> is handled.
/// </summary>
/// <typeparam name="T">Type of the supplied content; the text branch reads it via <c>content as string</c>.</typeparam>
/// <param name="content">Content to tokenize. In the text branch a value that is not a string becomes <c>null</c> before it is handed to the tokenizer.</param>
/// <param name="settings">Tokenizer settings; supplies <c>tknType</c> and <c>doBlockDetection</c>.</param>
/// <returns>The page produced by the selected tokenizer, or <c>null</c> when <c>settings.tknType</c> has no matching case.</returns>
public static IContentPage tokenizeContent<T>(T content, nlpTokenizatorSettings settings)
{
    switch (settings.tknType)
    {
        // HTML tokenization is currently disabled:
        //case tokenizationType.htmlTokenization:
        //    var htmlTkn = new htmlTokenizator(settings);
        //    return htmlTkn.tokenizeContent(content as HtmlDocument);

        case tokenizationType.textTokenization:
            // Alternative implementation kept for reference:
            // var textTkn = new defaultTokenizator(settings);
            var textTkn = new plainTextTokenizator(settings);
            return textTkn.tokenizeContent(content as string, settings.doBlockDetection);
    }

    // No case matched the requested tokenization type.
    return null;
}
/// <summary>
/// Creates the tokenizer; the settings are passed straight through to the base-class constructor.
/// </summary>
/// <param name="__settings">Tokenizer settings forwarded to the base class.</param>
public fakeTokenizator(nlpTokenizatorSettings __settings)
    : base(__settings)
{
    // No additional initialization beyond the base constructor.
}
/// <summary>
/// Creates the plain-text tokenizer; the settings are passed straight through to the base-class constructor.
/// </summary>
/// <param name="__settings">Tokenizer settings forwarded to the base class.</param>
public plainTextTokenizator(nlpTokenizatorSettings __settings)
    : base(__settings)
{
    // No additional initialization beyond the base constructor.
}
/// <summary>
/// Initializes the tokenizer base by storing the supplied settings in <c>settings</c>.
/// </summary>
/// <param name="__settings">Tokenizer settings to keep; stored as given, without validation.</param>
public tokenizatorBase(nlpTokenizatorSettings __settings)
{
    settings = __settings;
}