Ejemplo n.º 1
0
        /// <summary>
        /// Formats a text fragment according to the requested structure mode.
        /// </summary>
        /// <param name="value">Text to format; it is trimmed before use. Null, empty or whitespace-only input yields an empty string.</param>
        /// <param name="mode">Selects how the trimmed text is surrounded by line breaks or inline spacing.</param>
        /// <param name="settings">Only read in <c>spaceInline</c> mode, where its <c>inlineSpace</c> member is appended after the text.</param>
        /// <returns>The formatted text, or an empty string for blank input, the <c>ignore</c> mode, or a mode not handled here.</returns>
        public string deploy(string value, textExtraction_structure mode, textExtractionSetup settings)
        {
            // Guard before Trim(): calling Trim() on a null reference would throw.
            if (string.IsNullOrEmpty(value))
            {
                return "";
            }

            value = value.Trim();
            if (value.Length == 0)
            {
                return "";
            }

            switch (mode)
            {
            case textExtraction_structure.newLine:
                // One line break before the text and two after it.
                return Environment.NewLine + value + Environment.NewLine + Environment.NewLine;

            case textExtraction_structure.normal:
                return value + Environment.NewLine;

            case textExtraction_structure.spaceInline:
                return value + settings.inlineSpace;

            case textExtraction_structure.ignore:
            default:
                // Ignored (or unrecognized) modes contribute no text.
                return "";
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Builds a plain-text representation of a loaded document by collecting
        /// the text nodes found under <paramref name="source"/>.
        /// </summary>
        /// <param name="source">Navigator positioned on the node whose text descendants are collected; null yields an empty string.</param>
        /// <param name="settings">Extraction settings; passed through <c>checkSettings</c>, which supplies defaults when this is null.</param>
        /// <returns>Accepted text fragments, one per line, optionally with long runs of line breaks compressed.</returns>
        public string retriveText(XPathNavigator source, textExtractionSetup settings = null)
        {
            if (source == null)
            {
                return "";
            }

            settings = checkSettings(settings);

            StringBuilder collected = new StringBuilder();
            XPathNodeIterator textNodes = source.SelectDescendants(XPathNodeType.Text, true);

            while (textNodes.MoveNext())
            {
                if (textNodes.Current.NodeType != XPathNodeType.Text)
                {
                    continue;
                }

                string content = textNodes.Current.Value;
                if (string.IsNullOrEmpty(content))
                {
                    continue;
                }

                // Work on a copy so the iterator's own position is not disturbed.
                XPathNavigator owner = textNodes.Current.CreateNavigator();
                bool movedToParent = owner.MoveToParent();

                // When the move fails, 'owner' still points at the text node and
                // checkNode is evaluated against that node instead of a parent.
                if (!checkNode(owner, settings))
                {
                    continue;
                }

                if (movedToParent)
                {
                    collected.AppendLine(deploySpacing(content, owner, settings));
                }
                else
                {
                    collected.AppendLine(content);
                }
            }

            string text = collected.ToString();

            if (settings.doCompressNewLines)
            {
                string lineBreak = Environment.NewLine;
                string tripleBreak = lineBreak + lineBreak + lineBreak;
                string quadBreak = tripleBreak + lineBreak;

                // Repeatedly shrink four consecutive line breaks to three until no such run remains.
                while (text.Contains(quadBreak))
                {
                    text = text.Replace(quadBreak, tripleBreak);
                }
            }

            return text;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Applies spacing settings to a text fragment, intended to simulate the
        /// HTML structure of the tag that contains it. Tag-specific handling is
        /// currently disabled, so the text is returned unchanged.
        /// </summary>
        /// <param name="insert">Text fragment to be spaced.</param>
        /// <param name="parentTag">Navigator positioned on the element that contains the text; not read while tag-specific handling is disabled.</param>
        /// <param name="settings">Extraction settings; not read while tag-specific handling is disabled.</param>
        /// <returns><paramref name="insert"/>, unmodified.</returns>
        internal string deploySpacing(string insert, XPathNavigator parentTag, textExtractionSetup settings)
        {
            // The tag name used to be lower-cased here on every call even though nothing
            // consumed it; that lookup now lives only in the disabled dispatch kept below
            // for reference.

            /*
             * string tag = parentTag.Name.ToLowerInvariant();
             * if (htmlDefinitions.HTMLTags_blockStructureTags.Contains(tag))
             *  return deploy(insert, settings.spanExtractMode, settings);
             * if (htmlDefinitions.HTMLTags_headingTags.Contains(tag))
             *  return deploy(insert, settings.headingExtractMode, settings);
             * if (htmlDefinitions.HTMLTags_tableItemTags.Contains(tag))
             *  return deploy(insert, settings.tdExtractMode, settings);
             */
            return insert;
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Checks whether the supplied node should be exported according to the settings.
        /// </summary>
        /// <param name="source">Navigator positioned on the node to evaluate.</param>
        /// <param name="settings">Settings providing the export flags for script, title and style elements and for comments.</param>
        /// <returns>
        /// For elements: the matching export flag for <c>script</c>, <c>title</c> and <c>style</c>
        /// (matched case-insensitively), and true for any other element. For comments: the
        /// comment export flag. False for every other node type.
        /// </returns>
        internal bool checkNode(XPathNavigator source, textExtractionSetup settings)
        {
            switch (source.NodeType)
            {
            case XPathNodeType.Element:
                // Lower-case with the invariant culture: a culture-sensitive ToLower()
                // maps 'I' to dotless 'ı' under Turkish/Azeri cultures, so "SCRIPT" or
                // "TITLE" would no longer match the literals below.
                switch (source.Name.ToLowerInvariant())
                {
                case "script":
                    return settings.doExportScripts;

                case "title":
                    return settings.doExportTitle;

                case "style":
                    return settings.doExportStyles;

                default:
                    return true;
                }

            case XPathNodeType.Comment:
                return settings.doExportComments;

            default:
                // Whitespace, significant whitespace and all remaining node types are rejected.
                return false;
            }
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Returns the supplied settings, or a newly created default configuration when none is given.
        /// </summary>
        /// <param name="settings">Settings to validate; may be null.</param>
        /// <returns>
        /// <paramref name="settings"/> itself when it is not null; otherwise a new instance with
        /// script, comment and style export and child retrieval switched off, and HTML clean-up
        /// and Cyrillic-to-Latin transliteration switched on.
        /// </returns>
        public textExtractionSetup checkSettings(textExtractionSetup settings)
        {
            if (settings != null)
            {
                return settings;
            }

            // Members not listed here keep whatever value the constructor gives them.
            textExtractionSetup defaults = new textExtractionSetup
            {
                doExportScripts = false,
                doExportComments = false,
                doExportStyles = false,
                doRetrieveChildren = false,
                doHtmlCleanUp = true,
                doCyrToLatTransliteration = true
            };

            return defaults;
        }
Ejemplo n.º 6
0
 /// <summary>
 /// Convenience overload: creates a navigator for <paramref name="source"/> and
 /// delegates to the <see cref="XPathNavigator"/> overload.
 /// </summary>
 /// <param name="source">Navigable document or node; null yields an empty string.</param>
 /// <param name="settings">Extraction settings, forwarded unchanged (may be null).</param>
 /// <returns>The extracted text, or an empty string when <paramref name="source"/> is null.</returns>
 public string retriveText(IXPathNavigable source, textExtractionSetup settings = null)
 {
     // Return "" for a missing source instead of throwing on CreateNavigator().
     if (source == null)
     {
         return "";
     }

     return retriveText(source.CreateNavigator(), settings);
 }