/// <summary>
        /// COMMON TREE DETECTION: end-node xPath frequency.
        /// Intended to count how often each end-node xPath occurs across the supplied pages and to treat
        /// the paths that occur on (nearly) every page as the common structure.
        /// NOTE: the scoring implementation (imbKeywordScoreList + FindXPath) is currently disabled,
        /// so for a non-empty input the returned dictionary is still always empty.
        /// </summary>
        /// <param name="source">Pages under test, in XML structure. Null is treated like an empty array.</param>
        /// <param name="settings">
        /// Template settings. The disabled algorithm used <c>settings.treeTolerance</c> so that xPaths with
        /// up to N fewer occurrences than the page count still counted as common. Not read at the moment.
        /// </param>
        /// <returns>xPath-keyed elements of the common structure; empty when nothing was detected.</returns>
        public static Dictionary <string, templateElement> commonTree_imbENPF(crawledPage[] source,
                                                                              imbWebTemplateSettings settings)
        {
            Dictionary <string, templateElement> output = new Dictionary <string, templateElement>();

            // Nothing to analyse: report it and hand back the empty result instead of failing later.
            if (source == null || source.Length == 0)
            {
                logSystem.log("No pages supplied for common tree detection!", logType.FatalError);
                return output;
            }

            // DISABLED ALGORITHM (kept here as a summary of the former commented-out code):
            //   1. limit = source.Length - settings.treeTolerance
            //   2. for every page: build an imbNamespaceSetup from page.xmlDocument, collect the end nodes
            //      below xmlDocument.FirstChild, compute each node's xPath with FindXPath, add the xPath to
            //      an imbKeywordScoreList and remember the first XmlNode seen for that xPath
            //   3. for every scored xPath with score >= limit:
            //      output.Add(xPath, templateElement.makeElement(xPath, rememberedNode))
            // Until this is restored, nothing is ever added to the output.

            logSystem.log("Pronađeno ukupno [" + output.Count + "] elemenata", logType.Notification);

            return output;
        }
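// Example #2 (0) -- separator left over from the snippet listing this file was extracted from; not part of the code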
 /// <summary>
 /// Part of the imbBasic methodology: prepares the content of a template definition.
 /// NOTE: the whole implementation is commented out, so this method currently does nothing.
 /// </summary>
 /// <param name="template">Template definition whose content should be set up (currently untouched)</param>
 /// <param name="settings">Settings (currently not read)</param>
 public static void prepareContent(templateDefinition template, imbWebTemplateSettings settings)
 {
     // The disabled implementation below depends on an "nsSetup" object that is not declared
     // anywhere in this method. In outline it:
     //   - created template.templateXML and loaded a skeleton span/html document into it,
     //   - walked template.xPathStruktura, appending each element's content to a StringBuilder
     //     when settings.doMakeTEXT was set (the doMakeXML branch was already commented out),
     //   - copied template.templateXML.OuterXml to template.templateHTML when settings.doMakeHTML was set.
     /*
      * StringBuilder textMaker = new StringBuilder();
      * template.templateXML = new XmlDocument(nsSetup.namespaceManager.NameTable);
      *
      * String tmp = template.templateXML.OuterXml;
      *
      * template.templateXML.Prefix = nsSetup.nsPrefix;
      *
      *
      *
      * String basicXML = "<?xml version=\"1.0\" encoding=\"utf-16\"?>" + Environment.NewLine;
      * basicXML += "<span>" + Environment.NewLine;
      * basicXML += "<html xmlns=\"" + nsSetup.nsSourceUrl + "\" >" + Environment.NewLine;
      * basicXML += "</html>" + Environment.NewLine;
      * basicXML += "</span>" + Environment.NewLine;
      *
      * template.templateXML.LoadXml(basicXML);
      *
      *
      *
      *
      * foreach (KeyValuePair<String, templateElement> el in template.xPathStruktura)
      * {
      *  if (settings.doMakeTEXT)
      *  {
      *
      *      textMaker.AppendLine(el.Value.content);
      *  }
      *  if (settings.doMakeXML)
      *  {
      *     // imbXmlExtendedTools.makeNodeByxPath(template.templateXML, template.templateXML.DocumentElement, el.Key, el.Value.source, nsSetup);
      *  }
      * }
      *
      * if (settings.doMakeHTML)
      * {
      *  template.templateHTML = template.templateXML.OuterXml;
      * }
      * */
 }
        /// <summary>
        /// Produces the "UniKey" string for a template.
        /// NOTE: the key-building logic is disabled, so the result is always an empty string.
        /// </summary>
        /// <param name="input">Pages the key would be derived from (currently not read).</param>
        /// <param name="settings">Settings; the disabled logic switched on <c>settings.uniKeyMode</c> (currently not read).</param>
        /// <param name="template">Template the key is created for (currently not read).</param>
        /// <returns>Always the empty string while the logic is disabled.</returns>
        public static string makeUniKey(crawledPage[] input, imbWebTemplateSettings settings,
                                        templateDefinition template)
        {
            // DISABLED LOGIC (summary of the former commented-out code):
            //   - uniKeyCreation.idToString              -> template.id.ToString("D3")
            //   - uniKeyCreation.tokenizePageTitles      -> feed every page's pageCaption into an imbKeywordScoreList
            //   - uniKeyCreation.tokenizeTemplateContent -> feed template.templateHTML into the list once per page
            //   - if no key was produced: sort the list, take its first entry and trim it to at most 10 characters
            //   - log "UniKey created (<mode>) = <key>" with logType.Execution
            return "";
        }
        /// <summary>
        /// Runs the imbBasic template-detection method: common-tree detection, common-content check,
        /// content preparation, optional page-URL tracking and scoring.
        /// </summary>
        /// <param name="source">Loaded pages to analyse. An empty array yields a template with no elements and score 0.</param>
        /// <param name="settings">Settings controlling the detection method and whether page URLs are recorded.</param>
        /// <returns>The assembled template definition.</returns>
        public static templateDefinition detectTemplate_imbBasic(crawledPage[] source, imbWebTemplateSettings settings)
        {
            templateDefinition output = new templateDefinition();

            // COMMON TREE DETECTION
            // Only one method exists so far; unknown values fall back to it through "default".
            switch (settings.commonTreeDetection)
            {
            default:
            case commonTreeMethod.imbEndNodePathFrequency:
                output.xPathStruktura = templateOperations.commonTree_imbENPF(source, settings);
                break;
            }

            // The former unused "source.First()" lookup was removed here: its result was never read and
            // it threw InvalidOperationException for an empty page list, which the rest of this method
            // otherwise handles (and reports through the "Template detection failed!" log below).

            // COMMON CONTENT CHECK
            output.xPathStruktura = templateOperations.commonContentCheck(source, output.xPathStruktura, settings);
            templateExtensions.prepareContent(output, settings);

            // Page tracking: remember which pages contributed to this template.
            if (settings.doSavePageUrls)
            {
                foreach (crawledPage p in source)
                {
                    output.relatedPages.Add(p.url);
                }
            }

            // The score is the number of pages the template was derived from.
            output.score = source.Length;

            if (output.xPathStruktura.Count == 0)
            {
                logSystem.log("Template detection failed!", logType.Notification);
            }

            return output;
        }
        /// <summary>
        /// imbBasic methodology: meant to apply the content rules and, where required, build the common
        /// content for each templateElement.
        /// NOTE: the implementation is disabled, so the result is always a new, empty dictionary and the
        /// incoming <paramref name="xPathList"/> is not carried over.
        /// </summary>
        /// <param name="source">Pages to analyse (currently not read).</param>
        /// <param name="xPathList">Existing xPath list (currently not read).</param>
        /// <param name="settings">Settings (currently not read).</param>
        /// <returns>Intended: the (possibly reduced) list with common content defined. Currently: always empty.</returns>
        public static Dictionary <string, templateElement> commonContentCheck(crawledPage[] source,
                                                                              Dictionary <string, templateElement>
                                                                              xPathList,
                                                                              imbWebTemplateSettings settings)
        {
            // DISABLED LOGIC (summary of the former commented-out code, which was itself left in a
            // partly broken state):
            //   - empty source: log "No pages supplied for content check!" as a warning, return empty
            //   - commonContentPolicy.ignoreContent: return xPathList unchanged
            //   - otherwise, for every element of xPathList and every page:
            //       run the element's xPath against page.xmlDocument.DocumentElement and retrieve its text
            //       * onlyExactContent (also the default): keep the element only if the text is identical
            //         on all pages
            //       * extractCommonContent: tokenize each page's text, take the members common to all
            //         pages (within settings.contentExtractionTolerance), join them into the element's
            //         content and drop the element when that content is empty
            //   - log "<policy> :: Ulaz [<in>] - na izlazu> [<out>]" with logType.Execution
            return new Dictionary <string, templateElement>();
        }