/// <summary>
/// Initializes a new instance of the <see cref="htmlLinkNodeCollection"/> class, collecting the
/// link-root parent of every token in <paramref name="tkns"/> that is not already scoped.
/// </summary>
/// <param name="tkns">Token collection to scan for link-root parents.</param>
public htmlLinkNodeCollection(contentTokenCollection tkns)
{
    foreach (IHtmlContentElement tkn in tkns)
    {
        // Skip tokens already covered by the scoped set.
        if (scoped.Contains(tkn))
        {
            continue;
        }

        var linkParent = tkn.linkRootParent;
        if (linkParent != null)
        {
            Add(linkParent);
        }
    }
}
/// <summary>
/// Initializes a new instance of the <see cref="contentSentence"/> class with an empty token collection.
/// </summary>
public contentSentence() { items = new contentTokenCollection(); }
/// <summary>
/// Main method for processing the content of one sentence: first detects sub-sentences via
/// <c>_setSubSentences</c>, tokenizes each sub-sentence, then runs direct token detection through
/// <c>_setTokensForSentence</c> and attaches every resulting token to this sentence.
/// </summary>
/// <typeparam name="T">Concrete token type to instantiate.</typeparam>
/// <typeparam name="TS">Concrete sub-sentence type to instantiate.</typeparam>
/// <param name="resources">Optional resources: a <c>tokenDetectionFlag</c> and/or a <c>contentTokenCollection</c> to fill.</param>
/// <returns>The collection of detected tokens (the one supplied in <paramref name="resources"/>, or a fresh one).</returns>
public virtual contentTokenCollection setTokensFromContent <T, TS>(params object[] resources) where T : class, IContentToken, new() where TS : IContentSubSentence, new()
{
    IContentSentence sentence = this;
    tokenDetectionFlag detection_flags = resources.getFirstOfType <tokenDetectionFlag>();
    contentTokenCollection tokenCollections = resources.getFirstOfType <contentTokenCollection>();
    if (tokenCollections == null)
    {
        tokenCollections = new contentTokenCollection();
    }

    contentMatchCollection subsentenceMatches = _setSubSentences <TS>(detection_flags, null);

    try
    {
        // First pass: tokenize each detected sub-sentence and attach its tokens to it.
        for (int dti = 0; dti < subsentenceMatches.Count; dti++)
        {
            contentMatch dt = subsentenceMatches[subsentenceMatches.Keys.imbGetItemAt(dti).ToString()];
            contentSubSentence ss = dt.element as contentSubSentence;
            contentTokenCollection subtkns = new contentTokenCollection();
            var cs = ss._setTokensForSentence <T>(subtkns, detection_flags);
            for (int ci = 0; ci < cs.Count; ci++)
            {
                ss.setItem(cs[ci]);
            }
        }

        // Second pass: tokens owned directly by this sentence (outside any sub-sentence).
        List <IContentToken> directTokens = new List <IContentToken>();
        directTokens = _setTokensForSentence <T>(subsentenceMatches, detection_flags, tokenCollections, directTokens);
        if (directTokens != tokenCollections)
        {
            for (int dti = 0; dti < directTokens.Count; dti++)
            {
                IContentToken dt = directTokens[dti];
                T tkn = dt as T;
                if (tkn != null)
                {
                    tokenCollections.Add(tkn);
                }
            }
        }
    }
    catch (Exception ex)
    {
        var isb = new StringBuilder();
        isb.AppendLine("tokenDetection error");
        isb.AppendLine("Target is: " + sentence.toStringSafe());
        // FIX: the diagnostic message was built and then silently discarded on rethrow —
        // emit it through the logger before preserving the original exception.
        isb.AppendLine("Exception: " + ex.Message);
        aceLog.log(isb.ToString());
        throw;
    }

    // Attach every collected token to this sentence.
    foreach (var tk in tokenCollections)
    {
        setItem(tk);
    }

    return (tokenCollections);
}
/// <summary>
/// Runs macro-token detection (or accepts precomputed matches), applies the sub-sentence algorithm
/// and standard token detection over this sentence's content, and collects the resulting tokens.
/// Most importantly, it wires tokens / sub-sentences into their parent.
/// </summary>
/// <typeparam name="T">Concrete token type to instantiate.</typeparam>
/// <param name="resources">Optional: <c>contentMatchCollection</c> of sub-sentence matches, <c>contentTokenCollection</c> output, <c>tokenDetectionFlag</c> flags.</param>
/// <returns>The output token collection.</returns>
public virtual contentTokenCollection _setTokensForSentence <T>(params object[] resources) where T : IContentToken, new()
{
    contentMatchCollection subsentenceMatches = resources.getFirstOfType <contentMatchCollection>();
    if (subsentenceMatches == null)
    {
        subsentenceMatches = new contentMatchCollection();
    }

    // FIX: removed dead "if (macroTokens == null)" check on a variable that was just assigned null.
    contentMatchCollection macroTokens = _setMacroTokensForSentence <T>(resources);

    contentTokenCollection output = resources.getFirstOfType <contentTokenCollection>();
    if (output == null)
    {
        // FIX: callers that omit the output collection previously hit a NullReferenceException
        // below; guard the same way subsentenceMatches is guarded.
        output = new contentTokenCollection();
    }

    tokenDetectionFlag flags = resources.getFirstOfType <tokenDetectionFlag>();
    if (flags.HasFlag(tokenDetectionFlag.standardDetection))
    {
        macroTokens.Add(_select_tokenWithSplitter, tokenDetectionFlag.standardDetection);
    }

    string source = content;
    int i = 0;
    int mx = source.Length;
    int sI = 0;              // iteration counter, safety valve against a stuck loop
    int sLimit = mx;
    DateTime processStart = DateTime.Now;

    while (i < mx)
    {
        try
        {
            if (sI > sLimit)
            {
                aceLog.log("Content sentence tokenization broken");
                break;
            }
            sI++;

            oneOrMore <contentMatch> cms = macroTokens.allocated(i, 1);
            if (cms == null)
            {
                // No allocation map beyond this point — finish the scan.
                i = mx;
                continue;
            }
            if (cms.isNothing)
            {
                i++;
                continue;
            }

            contentMatch cm = cms.First();
            if (cm == null)
            {
                i++;
                continue;
            }

            i = i + cm.match.Length;
            IContentToken newToken = new T();
            string mch = cm.match.Value.Trim("#".ToCharArray());
            newToken.sourceContent = mch;
            newToken.content = mch;

            // Separate trailing splitter (punctuation) from the token content, if present.
            Match sp = _select_tokenSplitter.Match(mch);
            if (sp.Success)
            {
                newToken.spliter = sp.Value;
                newToken.content = newToken.content.removeEndsWith(newToken.spliter).Trim();
            }
            else
            {
                newToken.spliter = "";
                newToken.content = mch.Trim();
            }

            // FIX: was ".Minutes > 2" — the Minutes component wraps at 60, so the limit
            // silently stopped firing after an hour; TotalMinutes measures elapsed time.
            if (DateTime.Now.Subtract(processStart).TotalMinutes > 2)
            {
                aceLog.log("TOKENIZATION TIME LIMIT BROKEN !!!");
                break;
            }

            // A match backed by a sub-sentence becomes the token itself, carrying the
            // freshly created token as its master token.
            if (cm.element is IContentSubSentence)
            {
                IContentSubSentence sub = cm.element as IContentSubSentence;
                sub.masterToken = newToken;
                newToken = (IContentToken)cm.element;
            }

            // Carry the detection flag that produced this match, when its key parses as one.
            if (cm.associatedKey != null)
            {
                tokenDetectionFlag fl = tokenDetectionFlag.none;
                Enum.TryParse(cm.associatedKey.toStringSafe(), true, out fl);
                newToken.detectionFlags = fl;
            }

            if (!output.Contains(newToken))
            {
                if (newToken != this)
                {
                    output.Add(newToken);
                }
            }
        }
        catch (Exception)
        {
            var isb = new StringBuilder();
            isb.AppendLine("loop error error");
            isb.AppendLine("Target is: i=" + i + "[mx=" + mx + "]");
            throw new aceGeneralException(isb.ToString(), null, this, "Loop");
        }
    }

    return (output);
}
/// <summary>
/// Builds sentences from an <see cref="HtmlNode"/> and returns the collection — used both for
/// top-level sentences and, recursively, for sub-sentences of nested nodes.
/// </summary>
/// <param name="htmlNode">The HTML node to extract sentences from.</param>
/// <param name="parent">The parent element that each created sentence is attached to.</param>
/// <param name="output">Collection to append to; a new one is created when null.</param>
/// <param name="preprocessFlags">The preprocess flags.</param>
/// <param name="flags">The sentence-detection flags.</param>
/// <returns>The collection containing all created sentences.</returns>
public static contentTokenCollection createSentencesFromNode(this HtmlNode htmlNode, IHtmlContentElement parent, contentTokenCollection output = null, contentPreprocessFlag preprocessFlags = contentPreprocessFlag.none, sentenceDetectionFlag flags = sentenceDetectionFlag.none)
{
    if (output == null)
    {
        output = new contentTokenCollection();
    }

    // Collect acceptable child nodes; a leaf node is processed directly.
    List <HtmlNode> nodes = new List <HtmlNode>();
    if (htmlNode.HasChildNodes)
    {
        foreach (HtmlNode child in htmlNode.ChildNodes)
        {
            if (child.isNodeAcceptable())
            {
                nodes.Add(child);
            }
        }
    }
    else
    {
        nodes.Add(htmlNode);
    }

    foreach (HtmlNode child in nodes)
    {
        HtmlNode relNode = child;
        if (child.ChildNodes.Count > 0)
        {
            // Non-leaf node: recurse, wrapping the results in a container sentence.
            htmlContentSentence htmlSentence = new htmlContentSentence(child, "");
            contentTokenCollection subSentences = child.createSentencesFromNode(htmlSentence, null, preprocessFlags, flags);
            output.AddRange(subSentences);
            output.Add(htmlSentence);
            parent.setItem(htmlSentence);
        }
        else
        {
            string input = child.InnerText.Trim();
            if (flags.HasFlag(sentenceDetectionFlag.preprocessParagraphContent))
            {
                input = preprocess.process(input, preprocessFlags);
            }

            List <string> inputSentences = splitContentToSentences(input);
            foreach (string _inputSentece in inputSentences)
            {
                if (string.IsNullOrEmpty(_inputSentece))
                {
                    continue;
                }

                htmlContentSentence newSentence = new htmlContentSentence(relNode, _inputSentece);

                // FIX: evaluate the terminator regex once (was IsMatch followed by Match
                // on the same input — a redundant second evaluation).
                Match m = _select_sentenceTerminator.Match(_inputSentece);
                if (m.Success)
                {
                    newSentence.sentenceFlags |= contentSentenceFlag.regular;
                    newSentence.spliter = m.Value;
                    newSentence.content = _inputSentece.Substring(0, _inputSentece.Length - newSentence.spliter.Length);
                    newSentence.content = newSentence.content.Trim();
                }
                else
                {
                    newSentence.sentenceFlags |= contentSentenceFlag.inregular;
                }

                output.Add(newSentence);
                parent.setItem(newSentence);
            }
        }
    }
    return (output);
}