/// <summary>
/// Main tokenization command: splits this paragraph's content into sentences and calls
/// <c>setTokensFromContent</c> on every sentence that was produced.
/// </summary>
/// <remarks>
/// A sentence is attached to this paragraph (through <c>setItem</c>) only when
/// <c>sentenceDetectionFlag.setSentenceToParagraph</c> is set. When
/// <c>paragraphDetectionFlag.dropSentenceWithNoToken</c> is set, a sentence whose <c>items</c>
/// collection is empty after tokenization is skipped. Registration of sentences and tokens in the
/// <c>IContentPage</c> is currently disabled, so the page found in <paramref name="resources"/> is not used.
/// </remarks>
/// <typeparam name="TSentence">Type used for sentences</typeparam>
/// <typeparam name="TSubSentence">Type used for sub-sentences</typeparam>
/// <typeparam name="TToken">Type used for tokens</typeparam>
/// <param name="resources">
/// Loosely typed arguments, picked by type: IContentPage, basicLanguage, paragraphDetectionFlag,
/// sentenceDetectionFlag, contentPreprocessFlag and tokenDetectionFlag.
/// </param>
public virtual void setParagraphFromContent<TSentence, TSubSentence, TToken>(params object[] resources)
    where TSentence : IContentSentence, new()
    where TSubSentence : IContentSubSentence, new()
    where TToken : class, IContentToken, new()
{
    // Looked up for the (currently disabled) page registration step; nothing reads it at the moment.
    IContentPage page = resources.getFirstOfType<IContentPage>();

    // Fall back to a fresh language definition when the caller did not pass one.
    basicLanguage language = resources.getFirstOfType<basicLanguage>() ?? new basicLanguage();

    paragraphDetectionFlag paragraphOptions = resources.getFirstOfType<paragraphDetectionFlag>();
    sentenceDetectionFlag sentenceOptions = resources.getFirstOfType<sentenceDetectionFlag>();
    contentPreprocessFlag preprocessOptions = resources.getFirstOfType<contentPreprocessFlag>();
    tokenDetectionFlag tokenOptions = resources.getFirstOfType<tokenDetectionFlag>();

    contentSentenceCollection sentences = _setSentencesFromContent<TSentence>(sentenceOptions, preprocessOptions);

    foreach (TSentence sentence in sentences)
    {
        // The returned token list is not needed here.
        // NOTE(review): presumably the call fills sentence.items - confirm in the sentence implementation.
        sentence.setTokensFromContent<TToken, TSubSentence>(
            paragraphOptions, sentenceOptions, preprocessOptions, tokenOptions, resources, language);

        // The item count is only inspected when dropping of empty sentences was requested.
        bool dropAsEmpty = paragraphOptions.HasFlag(paragraphDetectionFlag.dropSentenceWithNoToken)
                           && sentence.items.Count == 0;
        if (dropAsEmpty)
        {
            continue;
        }

        if (sentenceOptions.HasFlag(sentenceDetectionFlag.setSentenceToParagraph))
        {
            setItem(sentence);
        }
    }
}
/// <summary>
/// Splits the <c>content</c> of this element into sentences and wraps every piece in a new
/// <typeparamref name="TSentence"/>.
/// </summary>
/// <remarks>
/// When <c>sentenceDetectionFlag.preprocessParagraphContent</c> is set, the content is first passed
/// through <c>preprocess.process</c>. A piece that matches the sentence terminator pattern is flagged
/// <c>regular</c>, the matched text is stored in <c>spliter</c>, and <c>content</c> is shortened by the
/// length of <c>spliter</c>; any other piece is flagged <c>inregular</c> and keeps its full text.
/// NOTE(review): the shortening cuts from the end of the piece, so it presumably relies on the
/// terminator pattern matching at the end - confirm against the pattern definition.
/// </remarks>
/// <typeparam name="TSentence">Type used for the created sentences</typeparam>
/// <param name="resources">Loosely typed arguments, picked by type: sentenceDetectionFlag, contentPreprocessFlag, IContentPage</param>
/// <returns>Collection with one sentence per piece returned by <c>splitContentToSentences</c></returns>
protected virtual contentSentenceCollection _setSentencesFromContent<TSentence>(params object[] resources)
    where TSentence : IContentSentence, new()
{
    string text = content;

    sentenceDetectionFlag sentenceOptions = resources.getFirstOfType<sentenceDetectionFlag>();
    contentPreprocessFlag preprocessOptions = resources.getFirstOfType<contentPreprocessFlag>();

    contentSentenceCollection sentences = new contentSentenceCollection();

    // Picks up the parent page when one was passed; nothing reads it yet.
    IContentPage parentPage = resources.getFirstOfType<IContentPage>();

    if (sentenceOptions.HasFlag(sentenceDetectionFlag.preprocessParagraphContent))
    {
        text = preprocess.process(text, preprocessOptions);
    }

    foreach (string rawSentence in splitContentToSentences(text))
    {
        TSentence sentence = new TSentence();
        sentence.sourceContent = rawSentence;
        sentence.content = rawSentence;

        Match terminator = _select_sentenceTerminator.Match(rawSentence);
        if (terminator.Success)
        {
            sentence.sentenceFlags |= contentSentenceFlag.regular;
            sentence.spliter = terminator.Value;
            // Strip as many trailing characters as the stored terminator is long.
            sentence.content = rawSentence.Substring(0, rawSentence.Length - sentence.spliter.Length);
        }
        else
        {
            sentence.sentenceFlags |= contentSentenceFlag.inregular;
        }

        sentences.Add(sentence);
    }

    return sentences;
}
/// <summary>
/// Sets the sub-sentences and tokens of the sentence found in <paramref name="resources"/> and returns
/// the list of all tokens of type <typeparamref name="T"/> that were collected.
/// </summary>
/// <remarks>
/// Steps, in order: the sentence content is preprocessed; sub-sentences are detected and added to
/// <c>sentence.items</c>, with their tokens collected into the result; direct tokens are detected via
/// <c>setTokensForSentence</c> and those that are of type <typeparamref name="T"/> are collected; finally
/// the preprocessed text is written back to <c>sentence.content</c>.
/// The method dereferences the sentence without a check, so <paramref name="resources"/> must contain an
/// <c>IContentSentence</c>. Exceptions raised during detection propagate to the caller unchanged.
/// </remarks>
/// <typeparam name="T">Token type</typeparam>
/// <typeparam name="TS">Sub-sentence type</typeparam>
/// <param name="resources">
/// Loosely typed arguments, picked by type: IContentSentence (the target sentence), contentPreprocessFlag,
/// subsentenceDetectionFlag and tokenDetectionFlag.
/// </param>
/// <returns>Tokens of the detected sub-sentences followed by the direct tokens of the sentence</returns>
internal static List<T> setTokensFromContent<T, TS>(params object[] resources)
    where T : class, IContentToken, new()
    where TS : IContentSubSentence, new()
{
    IContentSentence sentence = resources.getFirstOfType<IContentSentence>();
    contentPreprocessFlag preprocessFlags = resources.getFirstOfType<contentPreprocessFlag>();
    subsentenceDetectionFlag subflags = resources.getFirstOfType<subsentenceDetectionFlag>();
    tokenDetectionFlag flags = resources.getFirstOfType<tokenDetectionFlag>();

    List<T> output = new List<T>();

    // Computed before detection but assigned only at the very end, so sub-sentence and token
    // detection below still see the content as it was on entry (same ordering as before).
    string pcontent = preprocess.process(sentence.content, preprocessFlags);

    contentMatchCollection subsentenceMatches = subsentenceDetection.setSubSentences<TS>(sentence, subflags);

    foreach (contentMatch match in subsentenceMatches.Values)
    {
        IContentSubSentence subSentence = match.element as IContentSubSentence;
        if (subSentence == null)
        {
            // A match without a sub-sentence element has nothing to contribute; skipping it avoids
            // putting a null entry into sentence.items and failing on the next line.
            continue;
        }

        sentence.items.Add(subSentence);

        foreach (T subToken in subSentence.items)
        {
            output.Add(subToken);
        }
    }

    List<IContentToken> directTokens = setTokensForSentence<T>(sentence, true, flags, subsentenceMatches);
    if (directTokens != null)
    {
        foreach (IContentToken direct in directTokens)
        {
            // Only tokens of the requested type are reported back to the caller.
            T token = direct as T;
            if (token != null)
            {
                output.Add(token);
            }
        }
    }

    sentence.content = pcontent;

    return output;
}
/// <summary>
/// Applies the text normalizations selected in <paramref name="_flags"/> to <paramref name="content"/>.
/// </summary>
/// <remarks>
/// Works in two passes over the selected flags: the first pass handles quote, acronym, year-ordinal and
/// international-standards formatting; the second pass handles title standardization, so titles are
/// processed after every other normalization. The flags <c>enbraceStandardize</c> and <c>deentitize</c>
/// are accepted but currently do nothing (their handling is disabled).
/// </remarks>
/// <param name="content">Text to normalize</param>
/// <param name="_flags">Combination of preprocessing flags to apply</param>
/// <returns>The normalized text, or an empty string when <paramref name="content"/> is null or empty</returns>
public static string process(string content, contentPreprocessFlag _flags)
{
    var selectedFlags = _flags.getEnumListFromFlags();

    if (string.IsNullOrEmpty(content))
    {
        return "";
    }

    string text = content;

    // First pass: everything except title standardization.
    foreach (contentPreprocessFlag flag in selectedFlags)
    {
        if (flag == contentPreprocessFlag.quoteStandardization)
        {
            // Map the supported quote variants to a plain double quote.
            text = text.Replace("„", "\"")
                       .Replace(",,", "\"")
                       .Replace("''", "\"")
                       .Replace("``", "\"");
        }
        else if (flag == contentPreprocessFlag.acronimStandardization)
        {
            text = _select_acronimWithDots.Replace(text, _replace_acronimWithDots);
            text = _select_acronimWithDotsAndSpaces.Replace(text, _replace_acronimWithDots);
        }
        else if (flag == contentPreprocessFlag.yearOrdinal)
        {
            text = _select_yearOrdinalInGramarCase.Replace(text, _replace_yearOrdinalInGramarCase);
        }
        else if (flag == contentPreprocessFlag.internationalStandardsFormat)
        {
            text = _select_standardsFormatting.Replace(text, _replace_standardsFormatting);
        }
        // enbraceStandardize and deentitize: intentionally no action, their handling is disabled.
    }

    // Second pass: title standardization only.
    foreach (contentPreprocessFlag flag in selectedFlags)
    {
        if (flag == contentPreprocessFlag.titleStandardize)
        {
            text = _select_titleAllCapital.Replace(text, _replace_titleAllCapital);
            // NOTE(review): the first-capital pattern also uses _replace_titleAllCapital - confirm this is intended.
            text = _select_titleFirstCapital.Replace(text, _replace_titleAllCapital);
        }
    }

    return text;
}
//public static contentElementList tokenizeUrlAndTitle(String url, String title, String description="")
//{
//    contentElementList output = new contentStructure.collections.contentElementList();
//}

/// <summary>
/// Creates sentences from an HtmlNode and returns them as a collection - used for main sentences as
/// well as for sub-sentences.
/// </summary>
/// <remarks>
/// If the node has child nodes, only the children accepted by <c>isNodeAcceptable</c> are processed;
/// otherwise the node itself is processed. A processed node that has children of its own becomes a
/// container sentence (created with empty text) and is handled recursively, with the container as the
/// parent of the nested sentences. A processed node without children has its trimmed <c>InnerText</c>
/// split into sentences; empty pieces are skipped. Every created sentence is added to
/// <paramref name="output"/> and attached to <paramref name="parent"/> through <c>setItem</c>.
/// </remarks>
/// <param name="htmlNode">The HTML node.</param>
/// <param name="parent">The element that receives the created sentences.</param>
/// <param name="output">Collection to append to; a new one is created when null.</param>
/// <param name="preprocessFlags">Preprocessing flags, used only when <c>preprocessParagraphContent</c> is set in <paramref name="flags"/>.</param>
/// <param name="flags">Sentence detection flags.</param>
/// <returns>The collection the sentences were added to.</returns>
public static contentTokenCollection createSentencesFromNode(this HtmlNode htmlNode, IHtmlContentElement parent, contentTokenCollection output = null, contentPreprocessFlag preprocessFlags = contentPreprocessFlag.none, sentenceDetectionFlag flags = sentenceDetectionFlag.none)
{
    output = output ?? new contentTokenCollection();

    // Decide which nodes are processed: the acceptable children, or the node itself when it is a leaf.
    List<HtmlNode> candidates = new List<HtmlNode>();
    if (!htmlNode.HasChildNodes)
    {
        candidates.Add(htmlNode);
    }
    else
    {
        foreach (HtmlNode childNode in htmlNode.ChildNodes)
        {
            if (childNode.isNodeAcceptable())
            {
                candidates.Add(childNode);
            }
        }
    }

    foreach (HtmlNode node in candidates)
    {
        if (node.ChildNodes.Count > 0)
        {
            // Container node: nested sentences go first, then the container itself.
            htmlContentSentence container = new htmlContentSentence(node, "");
            contentTokenCollection nested = node.createSentencesFromNode(container, null, preprocessFlags, flags);

            output.AddRange(nested);
            output.Add(container);
            parent.setItem(container);
            continue;
        }

        string text = node.InnerText.Trim();
        if (flags.HasFlag(sentenceDetectionFlag.preprocessParagraphContent))
        {
            text = preprocess.process(text, preprocessFlags);
        }

        foreach (string rawSentence in splitContentToSentences(text))
        {
            if (string.IsNullOrEmpty(rawSentence))
            {
                continue;
            }

            htmlContentSentence sentence = new htmlContentSentence(node, rawSentence);

            Match terminator = _select_sentenceTerminator.Match(rawSentence);
            if (terminator.Success)
            {
                sentence.sentenceFlags |= contentSentenceFlag.regular;
                sentence.spliter = terminator.Value;
                // Strip as many trailing characters as the stored terminator is long, then trim.
                sentence.content = rawSentence.Substring(0, rawSentence.Length - sentence.spliter.Length);
                sentence.content = sentence.content.Trim();
            }
            else
            {
                sentence.sentenceFlags |= contentSentenceFlag.inregular;
            }

            output.Add(sentence);
            parent.setItem(sentence);
        }
    }

    return output;
}