/// <summary>
/// Primary flagging - from the micro level towards the macro level.
/// </summary>
/// <remarks>
/// Originally documented as relying only on this element's own data and the data of its items.
/// Note that the last rule below also inspects <c>parent</c>.
/// NOTE(review): the tag comparisons here are case-sensitive ("a" only) - confirm that
/// <c>htmlTag</c> is always stored in lower case.
/// </remarks>
/// <param name="resources">Passed through unchanged to the base implementation.</param>
public override void primaryFlaging(params object[] resources)
{
    base.primaryFlaging(resources);

    // Type-specific flagging rules follow.

    // Rule 1: the single item of this element is an anchor -> this element contains navigation.
    IHtmlContentElement onlyItem = (items.Count == 1) ? (items.First() as IHtmlContentElement) : null;
    if (onlyItem != null && onlyItem.htmlTag == "a")
    {
        sentenceFlags |= contentSentenceFlag.navigationContainer;
    }

    // Rule 2: this element itself is an anchor.
    bool isAnchor = htmlTag == "a";
    if (isAnchor)
    {
        sentenceFlags |= contentSentenceFlag.navigationLink;
    }

    // Rule 3: the direct parent is an anchor -> this element is the title of that link.
    IHtmlContentElement owner = parent as IHtmlContentElement;
    if (owner != null && owner.htmlTag == "a")
    {
        sentenceFlags |= contentSentenceFlag.titleForLink;
    }
}
/// <summary>
/// Collects the items of every element in <paramref name="input"/>, optionally recursing on the
/// collected list, and finally appends <paramref name="input"/> itself.
/// </summary>
/// <remarks>
/// NOTE(review): <paramref name="item"/> is only used to gate the recursion
/// (<c>item.items.Count &gt; 0</c>) and as the receiver of the recursive call; its own items are
/// never added directly. With a null or empty <paramref name="input"/> the result therefore
/// appears to be always empty - confirm that callers seed <paramref name="input"/>.
/// Whether duplicates are filtered depends on <c>AddMulti</c>, which is not visible here.
/// </remarks>
/// <param name="item">Element the traversal is invoked on.</param>
/// <param name="input">Elements whose items are collected; <c>null</c> is treated as an empty list.</param>
/// <param name="index">Remaining recursion budget; recursion stops once it is no longer above zero.</param>
/// <returns>A new list with the collected elements followed by <paramref name="input"/>.</returns>
public static List<IHtmlContentElement> getChildern(this IHtmlContentElement item, List<IHtmlContentElement> input, int index = 5)
{
    List<IHtmlContentElement> seeds = input ?? new List<IHtmlContentElement>();
    List<IHtmlContentElement> collected = new List<IHtmlContentElement>();

    // First level: items of every seed element.
    foreach (IHtmlContentElement seed in seeds)
    {
        collected.AddMulti(seed.items);
    }

    // Deeper levels: recurse using what was collected so far as the next seed list.
    bool descend = (index > 0) && (item.items.Count > 0);
    if (descend)
    {
        List<IHtmlContentElement> deeper = item.getChildern(collected, index - 1);
        collected.AddMulti(deeper);
    }

    collected.AddMulti(seeds);
    return collected;
}
/// <summary>
/// Wraps <paramref name="rootLink"/> in a new <c>htmlLinkNode</c>, registers it, and returns it.
/// </summary>
/// <remarks>
/// The node is passed to the <c>Add</c> overload that accepts a node, its <c>scoped</c> elements
/// are merged into this collection's <c>scoped</c>, and it is indexed in <c>byUrl</c> only when
/// its URL is not registered yet (the first node for a URL wins).
/// </remarks>
/// <param name="rootLink">Element handed to the <c>htmlLinkNode</c> constructor.</param>
/// <returns>The newly created node.</returns>
public new htmlLinkNode Add(IHtmlContentElement rootLink)
{
    htmlLinkNode node = new htmlLinkNode(rootLink);

    Add(node);
    scoped.AddMulti(node.scoped);

    // Keep the existing entry when the URL is already indexed.
    if (byUrl.ContainsKey(node.url))
    {
        return node;
    }

    byUrl.Add(node.url, node);
    return node;
}
/// <summary>
/// Returns the nearest ancestor of <paramref name="item"/> whose tag is the link tag
/// (<c>htmlTagName.a</c>), compared case-insensitively.
/// </summary>
/// <remarks>
/// The walk follows <c>parent</c> upwards and stops at the first parent that is not an
/// <c>IHtmlContentElement</c>. An ancestor with a null <c>htmlTag</c> is treated as a non-match.
/// </remarks>
/// <param name="item">The element whose ancestors are searched; the element itself is not tested.</param>
/// <returns>The closest enclosing link element, or <c>null</c> when there is none.</returns>
public static IHtmlContentElement linkRootParent(this IHtmlContentElement item)
{
    string linkTag = htmlTagName.a.ToString();

    // Iterative form of the original recursion. The original recursive step referenced the
    // method without invoking it, so the ancestor chain was never actually walked.
    IHtmlContentElement ancestor = item.parent as IHtmlContentElement;
    while (ancestor != null)
    {
        // Ordinal, case-insensitive comparison: culture-independent and safe for a null tag.
        if (string.Equals(ancestor.htmlTag, linkTag, System.StringComparison.OrdinalIgnoreCase))
        {
            return ancestor;
        }

        ancestor = ancestor.parent as IHtmlContentElement;
    }

    return null;
}
/// <summary>
/// Creates a link node for the given root element: stores it in <c>linkRootParent</c> and then
/// runs <c>evaluate()</c>.
/// </summary>
/// <param name="__linkRoot">Element stored as this node's <c>linkRootParent</c>.</param>
public htmlLinkNode(IHtmlContentElement __linkRoot)
{
    // Assigned before evaluate() runs - presumably evaluate() reads it; confirm.
    this.linkRootParent = __linkRoot;

    this.evaluate();
}
//public static contentElementList tokenizeUrlAndTitle(String url, String title, String description="")
//{
//    contentElementList output = new contentStructure.collections.contentElementList();
//}

/// <summary>
/// Builds sentences from an HTML node and returns them as a collection. Used for top-level
/// sentences as well as, recursively, for sub-sentences.
/// </summary>
/// <remarks>
/// When <paramref name="htmlNode"/> has child nodes, every child accepted by
/// <c>isNodeAcceptable()</c> is processed; otherwise the node itself is processed.
/// A processed node that has children becomes a container sentence with empty content: its
/// sub-sentences are produced recursively (with the container as their parent) and appended to
/// the output before the container itself. A processed node without children has its trimmed
/// <c>InnerText</c> split into sentences by <c>splitContentToSentences</c>.
/// Every sentence created at this level is also handed to <paramref name="parent"/> via
/// <c>setItem</c>.
/// </remarks>
/// <param name="htmlNode">The HTML node to read.</param>
/// <param name="parent">Element that receives each sentence created at this level.</param>
/// <param name="output">Collection to append to; a new one is created when <c>null</c>.</param>
/// <param name="preprocessFlags">Flags forwarded to <c>preprocess.process</c> and to recursive calls.</param>
/// <param name="flags">
/// Detection flags; text is preprocessed only when <c>preprocessParagraphContent</c> is set.
/// </param>
/// <returns>The collection that received the created sentences.</returns>
public static contentTokenCollection createSentencesFromNode(this HtmlNode htmlNode, IHtmlContentElement parent, contentTokenCollection output = null, contentPreprocessFlag preprocessFlags = contentPreprocessFlag.none, sentenceDetectionFlag flags = sentenceDetectionFlag.none)
{
    if (output == null)
    {
        output = new contentTokenCollection();
    }

    // Select the nodes to process: acceptable children, or the node itself when it is a leaf.
    List<HtmlNode> nodes = new List<HtmlNode>();
    if (htmlNode.HasChildNodes)
    {
        foreach (HtmlNode candidate in htmlNode.ChildNodes)
        {
            if (candidate.isNodeAcceptable())
            {
                nodes.Add(candidate);
            }
        }
    }
    else
    {
        nodes.Add(htmlNode);
    }

    foreach (HtmlNode child in nodes)
    {
        if (child.ChildNodes.Count > 0)
        {
            // Container node: sub-sentences are appended first, then the container sentence.
            htmlContentSentence htmlSentence = new htmlContentSentence(child, "");
            contentTokenCollection subSentences = child.createSentencesFromNode(htmlSentence, null, preprocessFlags, flags);
            output.AddRange(subSentences);
            output.Add(htmlSentence);
            parent.setItem(htmlSentence);
            continue;
        }

        // Leaf node: split its text into sentences.
        string input = child.InnerText.Trim();
        if (flags.HasFlag(sentenceDetectionFlag.preprocessParagraphContent))
        {
            input = preprocess.process(input, preprocessFlags);
        }

        List<string> inputSentences = splitContentToSentences(input);
        foreach (string sentenceText in inputSentences)
        {
            if (string.IsNullOrEmpty(sentenceText))
            {
                continue;
            }

            htmlContentSentence newSentence = new htmlContentSentence(child, sentenceText);

            // One regex evaluation decides regular/irregular and provides the terminator text.
            Match terminator = _select_sentenceTerminator.Match(sentenceText);
            if (terminator.Success)
            {
                newSentence.sentenceFlags |= contentSentenceFlag.regular;
                newSentence.spliter = terminator.Value;

                // Drops as many trailing characters as the terminator is long.
                // NOTE(review): assumes the terminator pattern matches at the end of the sentence - confirm.
                newSentence.content = sentenceText.Substring(0, sentenceText.Length - newSentence.spliter.Length);
                newSentence.content = newSentence.content.Trim();
            }
            else
            {
                newSentence.sentenceFlags |= contentSentenceFlag.inregular;
            }

            output.Add(newSentence);
            parent.setItem(newSentence);
        }
    }

    return output;
}