/// <summary>
/// Wraps a piece of extracted text with the spacing dictated by the given structure mode.
/// </summary>
/// <param name="value">Text to emit. It is trimmed first; <c>null</c>, empty or whitespace-only input yields an empty string.</param>
/// <param name="mode">Structure mode selecting how the text is surrounded by line breaks or inline spacing.</param>
/// <param name="settings">Extraction settings; only <c>inlineSpace</c> is read, and only for the <c>spaceInline</c> mode.</param>
/// <returns>
/// The trimmed text decorated according to <paramref name="mode"/>, or an empty string when there is
/// nothing to emit (no text, the <c>ignore</c> mode, or a mode not handled here).
/// </returns>
public string deploy(string value, textExtraction_structure mode, textExtractionSetup settings)
{
    // Guard against null BEFORE trimming: calling Trim() on a null reference would throw.
    if (value == null)
    {
        return "";
    }

    value = value.Trim();
    if (value.Length == 0)
    {
        return "";
    }

    StringBuilder output = new StringBuilder();

    switch (mode)
    {
        case textExtraction_structure.ignore:
            // Text under this mode is intentionally dropped.
            break;

        case textExtraction_structure.newLine:
            // One line break before the text and two after it.
            output.Append(Environment.NewLine);
            output.Append(value);
            output.Append(Environment.NewLine);
            output.Append(Environment.NewLine);
            break;

        case textExtraction_structure.normal:
            output.Append(value);
            output.Append(Environment.NewLine);
            break;

        case textExtraction_structure.spaceInline:
            output.Append(value + settings.inlineSpace);
            break;
    }

    return output.ToString();
}
/// <summary>
/// 2014c> New mechanism for building a plain-text representation of the loaded document.
/// </summary>
/// <remarks>
/// Visits every text node selected from <paramref name="source"/> (the source node itself is included
/// when it matches), filters each one through <c>checkNode</c> using the node's parent when it has one,
/// and appends each accepted text value on its own line.
/// </remarks>
/// <param name="source">Navigator to collect text from; <c>null</c> yields an empty string.</param>
/// <param name="settings">Extraction settings; passed through <c>checkSettings</c> before use.</param>
/// <returns>The collected text, with long runs of line breaks shortened when <c>doCompressNewLines</c> is set.</returns>
public string retriveText(XPathNavigator source, textExtractionSetup settings = null)
{
    if (source == null)
    {
        return "";
    }

    settings = checkSettings(settings);

    StringBuilder collected = new StringBuilder();
    XPathNodeIterator textNodes = source.SelectDescendants(XPathNodeType.Text, true);

    while (textNodes.MoveNext())
    {
        if (textNodes.Current.NodeType != XPathNodeType.Text)
        {
            continue;
        }

        string text = textNodes.Current.Value;
        if (string.IsNullOrEmpty(text))
        {
            continue;
        }

        // Work on a clone so the iterator's own position is never disturbed.
        XPathNavigator owner = textNodes.Current.CreateNavigator();
        bool movedToParent = owner.MoveToParent();

        // When the move succeeded the parent is checked; otherwise the text node itself is checked.
        if (!checkNode(owner, settings))
        {
            continue;
        }

        if (movedToParent)
        {
            collected.AppendLine(deploySpacing(text, owner, settings));
        }
        else
        {
            collected.AppendLine(text);
        }
    }

    string result = collected.ToString();

    if (settings.doCompressNewLines)
    {
        string lineBreak = Environment.NewLine;
        string tripleBreak = lineBreak + lineBreak + lineBreak;
        string quadBreak = tripleBreak + lineBreak;

        // Disabled alternative: result = tokenization.blankLineSelector.Replace(result, tripleBreak);

        // Repeat until no run of four consecutive line breaks is left.
        while (result.Contains(quadBreak))
        {
            result = result.Replace(quadBreak, tripleBreak);
        }
    }

    return result;
}
/// <summary>
/// Applies spacing settings to a text fragment - intended to simulate the HTML structure.
/// </summary>
/// <remarks>
/// The tag-dependent rules are currently disabled (see the commented block in the body), so the
/// text is returned unchanged and neither <paramref name="parentTag"/> nor
/// <paramref name="settings"/> is read.
/// </remarks>
/// <param name="insert">Text fragment to be inserted into the output.</param>
/// <param name="parentTag">Navigator positioned on the parent node of the text fragment.</param>
/// <param name="settings">Extraction settings carrying the per-tag extraction modes.</param>
/// <returns><paramref name="insert"/>, unchanged.</returns>
internal string deploySpacing(string insert, XPathNavigator parentTag, textExtractionSetup settings)
{
    // The lowercased tag name used to be computed here on every call although nothing read it;
    // it is kept only inside the disabled block below, where it would be needed again.

    /*
     * string tag = parentTag.Name.ToLower();
     * if (htmlDefinitions.HTMLTags_blockStructureTags.Contains(tag))
     *     return deploy(insert, settings.spanExtractMode, settings);
     * if (htmlDefinitions.HTMLTags_headingTags.Contains(tag))
     *     return deploy(insert, settings.headingExtractMode, settings);
     * if (htmlDefinitions.HTMLTags_tableItemTags.Contains(tag))
     *     return deploy(insert, settings.tdExtractMode, settings);
     */

    return insert;
}
/// <summary>
/// Checks whether the supplied node is in agreement with the extraction settings.
/// </summary>
/// <param name="source">Navigator positioned on the node to test.</param>
/// <param name="settings">Settings providing the <c>doExportScripts</c>, <c>doExportTitle</c>, <c>doExportStyles</c> and <c>doExportComments</c> switches.</param>
/// <returns>
/// For elements: the matching switch for <c>script</c>, <c>title</c> and <c>style</c>, and <c>true</c> for any other element.
/// For comments: <c>doExportComments</c>. For every other node type: <c>false</c>.
/// </returns>
internal bool checkNode(XPathNavigator source, textExtractionSetup settings)
{
    switch (source.NodeType)
    {
        case XPathNodeType.Element:
            // Lowercase with the invariant culture: culture-sensitive lowercasing (e.g. tr-TR maps
            // 'I' to a dotless 'ı') would make "SCRIPT" / "TITLE" miss the cases below.
            switch (source.Name.ToLowerInvariant())
            {
                case "script":
                    return settings.doExportScripts;

                case "title":
                    return settings.doExportTitle;

                case "style":
                    return settings.doExportStyles;

                default:
                    return true;
            }

        case XPathNodeType.Comment:
            return settings.doExportComments;

        case XPathNodeType.Whitespace:
        case XPathNodeType.SignificantWhitespace:
        default:
            // Whitespace nodes and any node type not listed above are never exported.
            return false;
    }
}
/// <summary>
/// Ensures that a usable settings instance is available.
/// </summary>
/// <param name="settings">Caller-supplied settings, or <c>null</c> to request the defaults.</param>
/// <returns>
/// <paramref name="settings"/> itself when it is not <c>null</c>; otherwise a new instance with script,
/// comment and style export and child retrieval switched off, and HTML clean-up and
/// Cyrillic-to-Latin transliteration switched on.
/// </returns>
public textExtractionSetup checkSettings(textExtractionSetup settings)
{
    if (settings != null)
    {
        return settings;
    }

    // No settings supplied: fall back to the default extraction profile.
    textExtractionSetup defaults = new textExtractionSetup
    {
        doExportScripts = false,
        doExportComments = false,
        doExportStyles = false,
        doRetrieveChildren = false,
        doHtmlCleanUp = true,
        doCyrToLatTransliteration = true
    };

    return defaults;
}
/// <summary>
/// Convenience overload: extracts text from any <see cref="IXPathNavigable"/> source by creating
/// a navigator for it and delegating to the <see cref="XPathNavigator"/> overload.
/// </summary>
/// <param name="source">Navigable source to extract text from; <c>null</c> yields an empty string.</param>
/// <param name="settings">Extraction settings, forwarded unchanged; may be <c>null</c>.</param>
/// <returns>The extracted text, or an empty string when <paramref name="source"/> is <c>null</c>.</returns>
public string retriveText(IXPathNavigable source, textExtractionSetup settings = null)
{
    // Match the navigator overload, which returns an empty string for a null source,
    // instead of failing on the CreateNavigator() call.
    if (source == null)
    {
        return "";
    }

    return retriveText(source.CreateNavigator(), settings);
}