/// <summary>
/// Common tree detection by end-node XPath frequency (imbENPF).
/// </summary>
/// <remarks>
/// The frequency-counting implementation is currently disabled (it is kept as a reference
/// comment inside the body). Until it is restored, this method only checks for an empty
/// <paramref name="source"/>, writes a log entry and returns an empty dictionary.
/// According to the disabled code, paths are scored per occurrence across all pages and a
/// path is kept when its score reaches <c>source.Length - settings.treeTolerance</c>.
/// </remarks>
/// <param name="source">Pages under test.</param>
/// <param name="settings">
/// Template detection settings. Not read while the implementation is disabled; the disabled
/// code uses <c>settings.treeTolerance</c> to accept paths that occur N fewer times.
/// </param>
/// <returns>Map from XPath to template element for the common structure; currently always empty.</returns>
public static Dictionary<string, templateElement> commonTree_imbENPF(crawledPage[] source, imbWebTemplateSettings settings)
{
    Dictionary<string, templateElement> output = new Dictionary<string, templateElement>();

    if (source.Length == 0)
    {
        logSystem.log("No pages supplied for common tree detection!", logType.FatalError);
        return output;
    }

    // Disabled reference implementation. The locals it needs (allXmlNodes, limit, nsSetup)
    // are intentionally not declared while it is switched off.
    //
    // imbKeywordScoreList allPaths = new imbKeywordScoreList();
    // Dictionary<string, XmlNode> allXmlNodes = new Dictionary<string, XmlNode>();
    // int limit = source.Length - settings.treeTolerance;
    //
    // foreach (crawledPage cpage in source)
    // {
    //     imbNamespaceSetup nsSetup = new imbNamespaceSetup(cpage.xmlDocument);
    //     List<XmlNode> _endNodes = cpage.xmlDocument.FirstChild.collectChildren(collectRelatives.endNodes, 0);
    //     foreach (XmlNode node in _endNodes)
    //     {
    //         String xp = node.FindXPath(nsSetup);
    //
    //         allPaths.addKeyword(xp);
    //         if (!allXmlNodes.ContainsKey(xp))
    //         {
    //             allXmlNodes.Add(xp, node);
    //         }
    //     }
    // }
    //
    // foreach (imbKeywordScore ki in allPaths)
    // {
    //     if (ki.score >= limit)
    //     {
    //         output.Add(ki.keyword, templateElement.makeElement(ki.keyword, allXmlNodes[ki.keyword]));
    //     }
    // }

    logSystem.log("Pronađeno ukupno [" + output.Count + "] elemenata", logType.Notification);
    return output;
}
/// <summary>
/// Part of the imbBasic methodology: prepares the content of a template definition.
/// </summary>
/// <remarks>
/// The implementation is currently disabled (kept as a reference comment inside the body),
/// so calling this method has no effect.
/// </remarks>
/// <param name="template">Template definition whose content should be set up.</param>
/// <param name="settings">Settings.</param>
public static void prepareContent(templateDefinition template, imbWebTemplateSettings settings)
{
    // Disabled reference implementation:
    //
    // StringBuilder textMaker = new StringBuilder();
    // template.templateXML = new XmlDocument(nsSetup.namespaceManager.NameTable);
    //
    // String tmp = template.templateXML.OuterXml;
    //
    // template.templateXML.Prefix = nsSetup.nsPrefix;
    //
    // String basicXML = "<?xml version=\"1.0\" encoding=\"utf-16\"?>" + Environment.NewLine;
    // basicXML += "<span>" + Environment.NewLine;
    // basicXML += "<html xmlns=\"" + nsSetup.nsSourceUrl + "\" >" + Environment.NewLine;
    // basicXML += "</html>" + Environment.NewLine;
    // basicXML += "</span>" + Environment.NewLine;
    //
    // template.templateXML.LoadXml(basicXML);
    //
    // foreach (KeyValuePair<String, templateElement> el in template.xPathStruktura)
    // {
    //     if (settings.doMakeTEXT)
    //     {
    //         textMaker.AppendLine(el.Value.content);
    //     }
    //     if (settings.doMakeXML)
    //     {
    //         // imbXmlExtendedTools.makeNodeByxPath(template.templateXML, template.templateXML.DocumentElement, el.Key, el.Value.source, nsSetup);
    //     }
    // }
    //
    // if (settings.doMakeHTML)
    // {
    //     template.templateHTML = template.templateXML.OuterXml;
    // }
}
/// <summary>
/// Creates the unique key for a template.
/// </summary>
/// <remarks>
/// The key-building logic is currently disabled (kept as a reference comment inside the
/// body), so this method always returns an empty string and reads none of its arguments.
/// </remarks>
/// <param name="input">Pages related to the template; unused while the logic is disabled.</param>
/// <param name="settings">Settings; the disabled code switches on <c>settings.uniKeyMode</c>.</param>
/// <param name="template">Template definition; unused while the logic is disabled.</param>
/// <returns>An empty string.</returns>
public static string makeUniKey(crawledPage[] input, imbWebTemplateSettings settings, templateDefinition template)
{
    // Disabled reference implementation (it built the key in a local string named `output`):
    //
    // imbKeywordScoreList nameList = new imbKeywordScoreList();
    // switch (settings.uniKeyMode)
    // {
    //     case uniKeyCreation.idToString:
    //         output = template.id.ToString("D3");
    //         break;
    //     case uniKeyCreation.tokenizePageTitles:
    //         foreach (crawledPage p in input)
    //         {
    //             nameList.addText(p.pageCaption, false, imbNLPengine.imbBasic);
    //         }
    //         break;
    //     case uniKeyCreation.tokenizeTemplateContent:
    //         foreach (crawledPage p in input)
    //         {
    //             nameList.addText(template.templateHTML, false, imbNLPengine.imbBasic);
    //         }
    //         break;
    // }
    // if (output == "")
    // {
    //     nameList.sort();
    //     output = imbCollectionHelpers.imbGetFirstValue<String>(nameList.getStringList(), "", false, 0);
    //     output = output.TrimToMaxLength(10, "");
    // }
    // logSystem.log("UniKey created (" + settings.uniKeyMode + ") = " + output, logType.Execution);

    return string.Empty;
}
/// <summary>
/// Runs the imbBasic template detection method.
/// </summary>
/// <remarks>
/// Steps, in order: common tree detection (selected by <c>settings.commonTreeDetection</c>),
/// common content check, content preparation, optional recording of the page URLs, and
/// scoring (the score is the number of supplied pages). When the resulting structure is
/// empty a notification is logged and the definition is still returned.
/// </remarks>
/// <param name="source">List of loaded pages.</param>
/// <param name="settings">Settings.</param>
/// <returns>The populated template definition.</returns>
public static templateDefinition detectTemplate_imbBasic(crawledPage[] source, imbWebTemplateSettings settings)
{
    templateDefinition output = new templateDefinition();

    // COMMON TREE DETECTION
    switch (settings.commonTreeDetection)
    {
        default:
        case commonTreeMethod.imbEndNodePathFrequency:
            output.xPathStruktura = templateOperations.commonTree_imbENPF(source, settings);
            break;
    }

    // No "first page" lookup here: its only consumer (a namespace setup) is disabled, and
    // Enumerable.First() throws on an empty array, which kept an empty `source` from ever
    // reaching the "Template detection failed!" path below.

    // COMMON CONTENT CHECK
    output.xPathStruktura = templateOperations.commonContentCheck(source, output.xPathStruktura, settings);

    templateExtensions.prepareContent(output, settings);

    // page track
    if (settings.doSavePageUrls)
    {
        foreach (crawledPage p in source)
        {
            output.relatedPages.Add(p.url);
        }
    }

    output.score = source.Length;

    if (output.xPathStruktura.Count == 0)
    {
        logSystem.log("Template detection failed!", logType.Notification);
    }

    return output;
}
/// <summary>
/// imbBasic methodology: applies the content-related rules and, where needed, builds the
/// common content for each template element.
/// </summary>
/// <remarks>
/// NOTE(review): the implementation is currently disabled (kept as a reference comment
/// inside the body). As written, the method reads none of its arguments and always returns
/// a new, empty dictionary, regardless of what <paramref name="xPathList"/> contains.
/// </remarks>
/// <param name="source">Pages to analyse.</param>
/// <param name="xPathList">Existing XPath list.</param>
/// <param name="settings">Settings.</param>
/// <returns>
/// Intended: the (possibly reduced) list with the common content defined.
/// Currently: an empty dictionary.
/// </returns>
public static Dictionary<string, templateElement> commonContentCheck(crawledPage[] source, Dictionary<string, templateElement> xPathList, imbWebTemplateSettings settings)
{
    // Disabled reference implementation (it accumulated results in a local dictionary
    // named `output`):
    //
    // if (source.Length == 0)
    // {
    //     logSystem.log("No pages supplied for content check!", logType.Warning);
    //     return output;
    // }
    // switch (settings.contentPolicy)
    // {
    //     case commonContentPolicy.ignoreContent:
    //         return xPathList;
    //         break;
    // }
    // foreach (KeyValuePair<String, templateElement> el in xPathList)
    // {
    //     String tmpContent = null;
    //     String[] cmnContent = null; //= el.Value.content;
    //     List<String[]> contents = new List<string[]>();
    //     Boolean add = true;
    //     foreach (crawledPage p in source)
    //     {
    //         String textCon = "";
    //         XmlNode nd = imbAdvancedXPath.xPathExecution(el.Key, p.xmlDocument.DocumentElement, null, imbCore.xml.queryEngine.imbXPathQuery, true, 0);
    //         textCon = textRetriveEngine.retriveText(nd, settings.textRetriveSetup);
    //         switch (settings.contentPolicy)
    //         {
    //             default:
    //             case commonContentPolicy.onlyExactContent:
    //                 if (tmpContent == null)
    //                 {
    //                     tmpContent = textCon;
    //                 }
    //                 if (tmpContent != textCon)
    //                 {
    //                     add = false;
    //                     break;
    //                 }
    //                 break;
    //             case commonContentPolicy.extractCommonContent:
    //                 //String[] lines = imbNLPTools.defaultSplit(textCon, imbNLPengine.imbBasic, settings.contentTokenizationSettings);
    //                 contents.Add(lines);
    //                 break;
    //         }
    //     }
    //     if (settings.contentPolicy == commonContentPolicy.extractCommonContent)
    //     //{
    //     //    List<String> commonStrings = imbNLPTools.getCommonMembers(contents, settings.contentExtractionTolerance);
    //     //    String commonContent = imbNLPTools.defaultJoin(commonStrings.ToArray(), imbNLPengine.imbBasic, settings.contentTokenizationSettings);
    //         el.Value.content = commonContent;
    //         if (String.IsNullOrEmpty(commonContent))
    //         {
    //             add = false;
    //         }
    //     }
    //     if (add)
    //     {
    //         output.Add(el.Key, el.Value);
    //     }
    // }
    // logSystem.log(settings.contentPolicy + " :: Ulaz [" + xPathList.Count() + "] - na izlazu> [" + output.Count() + "]", logType.Execution);

    return new Dictionary<string, templateElement>();
}