/// <summary>
/// Returns all pages for the specified domain, with duplicate URLs removed
/// </summary>
/// <param name="domainName">Name of the domain.</param>
/// <returns>Deduplicated list of index pages for the domain</returns>
public List<indexPage> GetPagesForDomain(string domainName)
{
    domainAnalysis da = new domainAnalysis(domainName);

    // select all rows matching the normalized domain name
    var rows = tableSelect("domain = '" + da.domainName + "'");
    var pages = GetObjectFromRows(rows);

    List<indexPage> output = new List<indexPage>();
    HashSet<string> urls = new HashSet<string>();

    foreach (indexPage page in pages)
    {
        // keep the page only if its URL was not collected already
        if (urls.Add(page.url))
        {
            output.Add(page);
        }
    }

    return output;
}
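// Usage sketch (names hypothetical; assumes an index table instance exposing
// GetPagesForDomain as defined above):
public void PrintPagesForDomain(indexPageTable pageIndexTable)
{
    List<indexPage> pages = pageIndexTable.GetPagesForDomain("koplas.co.rs");
    foreach (indexPage page in pages)
    {
        // each URL appears only once thanks to the deduplication above
        Console.WriteLine(page.url);
    }
}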
public modelSpiderSiteRecord(string __testRunStamp, spiderWeb __instance) : base(__testRunStamp, __instance)
{
    domainInfo = new domainAnalysis(instance.seedLink.url);

    // per-iteration performance records, one table per domain
    iterationTableRecord = new objectTable<iterationPerformanceRecord>("key", "iteration_" + domainInfo.domainName.Replace(".", "_"));
}
public webSiteProfile(String _url)
{
    domainInfo = new domainAnalysis(_url);

    url = domainInfo.urlProper;
    domain = domainInfo.domainName;
    name = domainInfo.domainRootName;
}
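// Usage sketch (illustrative; the exact normalization is up to domainAnalysis):
// webSiteProfile profile = new webSiteProfile("http://www.koplas.co.rs");
// profile.url    <- domainInfo.urlProper      (canonical URL form)
// profile.domain <- domainInfo.domainName     (e.g. "koplas.co.rs")
// profile.name   <- domainInfo.domainRootName (e.g. "koplas")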
public int Add(List<indexDomain> __indexDomains, int start = 0, int take = -1)
{
    int c = 0;
    int i = 0;

    // take == -1 means: take everything from <start> to the end of the list
    if (take == -1)
    {
        take = __indexDomains.Count - start;
    }

    foreach (indexDomain domain in __indexDomains)
    {
        if ((i >= start) && (i < (start + take)))
        {
            domainAnalysis da = new domainAnalysis(domain.url);
            if (Add(da.urlProper))
            {
                indexDomains.Add(domain);
                c++;
            }
        }
        i++;
    }

    return c;
}
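// Usage sketch (hypothetical names): importing the second batch of fifty
// domains from a larger list; the start/take pair mirrors LINQ's Skip/Take,
// and the return value counts only domains that were actually new.
//
//   int added = indexTable.Add(allDomains, start: 50, take: 50);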
internal void deploy(imbMCWebSiteEntry __entry)
{
    entry = __entry;

    // create domain info from the entry's proper URL if it was not set already
    if (domainInfo == null)
    {
        domainInfo = new domainAnalysis(entry.domainProperUrl);
    }

    name = domainInfo.domainRootName;
}
public static String GetCleanCaseName(String properUrl)
{
    String output = properUrl;

    // if the input still looks like a URL or domain, reduce it to a clean name
    if (output.ContainsAny(new String[] { ".", "/", ":" }))
    {
        domainAnalysis da = new domainAnalysis(properUrl);
        return da.domainName.Replace(".", "_");
    }

    return output;
}
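// Worked example (indicative; the exact domainName normalization is up to
// domainAnalysis): URLs are reduced to identifier-safe case names, while an
// already clean string passes through unchanged.
//
//   String a = GetCleanCaseName("http://www.koplas.co.rs"); // e.g. "koplas_co_rs"
//   String b = GetCleanCaseName("koplas");                  // "koplas" (no '.', '/' or ':')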
/// <summary>
/// Customized code to be executed once the console is started
/// </summary>
public override void onStartUp()
{
    base.onStartUp();

    dataBaseTarget dBT = new dataBaseTarget();
    imbACE.Network.tools.systemKnowledge.prepare(dBT, output);

    domainAnalysis da = new domainAnalysis("http://www.koplas.co.rs");

    // put here your code
}
/// <summary>
/// Loads the domain list from a text file, one domain per line.
/// </summary>
/// <param name="path">The path.</param>
/// <param name="options">The options.</param>
/// <param name="logger">The log/diagnostic output.</param>
public void LoadDomainList(String path, WebDomainCategoryFormatOptions options = WebDomainCategoryFormatOptions.saveReadmeFile | WebDomainCategoryFormatOptions.saveAggregate | WebDomainCategoryFormatOptions.normalizeDomainname, ILogBuilder logger = null)
{
    if (File.Exists(path))
    {
        sites.Clear();
        String[] list = File.ReadAllLines(path);
        foreach (String ln in list)
        {
            String s = ln;
            if (options.HasFlag(WebDomainCategoryFormatOptions.normalizeDomainname))
            {
                domainAnalysis da = new domainAnalysis(s);
                s = da.urlProper;
            }
            sites.Add(s);
        }
    }
}
/// <summary>
/// Gets the domain list as newline-separated text.
/// </summary>
/// <param name="options">The options.</param>
/// <param name="logger">The log/diagnostic output.</param>
/// <returns>One domain per line, optionally normalized</returns>
public String GetDomainList(WebDomainCategoryFormatOptions options, ILogBuilder logger = null)
{
    StringBuilder sb = new StringBuilder();
    foreach (String s in sites)
    {
        String ln = s;
        if (options.HasFlag(WebDomainCategoryFormatOptions.normalizeDomainname))
        {
            domainAnalysis da = new domainAnalysis(s);
            ln = da.urlProper;
        }
        sb.AppendLine(ln);
    }
    return sb.ToString();
}
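// Round-trip sketch (hypothetical instance name): both methods honor the
// normalizeDomainname flag, so a load/export cycle yields normalized entries.
//
//   category.LoadDomainList("domains.txt", WebDomainCategoryFormatOptions.normalizeDomainname);
//   String export = category.GetDomainList(WebDomainCategoryFormatOptions.normalizeDomainname);
//   File.WriteAllText("domains_normalized.txt", export);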
/// <summary>
/// Collects descriptive data from a node/crawled page object -- called from the tokenizer; it does not transfer the page content itself
/// </summary>
/// <param name="sourcePage">The crawled page node to take the descriptive data from</param>
public void acceptSourcePage(node sourcePage)
{
    if (sourcePage != null)
    {
        contentUrl = sourcePage.url;
        contentTitle = sourcePage.caption.htmlContentProcess();

        if (!domainInfo.isNullOrEmptyString())
        {
            domainInfo = new domainAnalysis(contentUrl);

            // tokens extracted from the domain name
            foreach (string word in domainInfo.domainWords)
            {
                contentToken newToken = new contentToken();
                newToken.content = word;
                newToken.sourceContent = domainInfo.domainRootName;
                newToken.origin = contentTokenOrigin.domain;
                headTokens.Add(newToken);
            }

            // tokens extracted from the page title
            List<string> words = new List<string>();
            MatchCollection mchs = imbStringSelect._select_wordsFromDomainname.Matches(contentTitle);
            foreach (Match mch in mchs)
            {
                words.Add(mch.Value.ToLower());
            }

            foreach (string word in words)
            {
                contentToken newToken = new contentToken();
                newToken.content = word;
                newToken.sourceContent = contentTitle;
                newToken.origin = contentTokenOrigin.title;
                headTokens.Add(newToken);
            }
        }
    }
}
/// <summary>
/// Builds or updates web site repositorium using crawling information.
/// </summary>
/// <param name="targetCollection">Collection of SpiderTargets, populated by DLC crawl</param>
/// <param name="domainInfo">DLC domain information</param>
/// <param name="output">The output.</param>
/// <returns>
/// Reference to created or updated web site repository
/// </returns>
/// <remarks>
/// This method uses completed DLC information to create an <see cref="imbMCWebSite" /> repository and <see cref="imbMCWebPage" /> instances for all proper targets
/// </remarks>
public imbMCWebSite BuildWebSite(ISpiderTargetCollection targetCollection, domainAnalysis domainInfo, ILogBuilder output = null)
{
    int pageCount = 0;

    imbMCWebSite repo = GetWebSite(domainInfo, true, output);
    pageCount = repo.pageTable.Count;

    if (pageCount == 0)
    {
        loger.log("Web site repository created [" + domainInfo.domainName + "]");
    }

    // build a page entry for every crawled target that passes the content filter
    List<ISpiderTarget> crawledTargets = targetCollection.GetLoaded();
    foreach (ISpiderTarget target in crawledTargets)
    {
        if (isTargetProper(target))
        {
            BuildWebPage(target, repo);
        }
    }

    int nPageCount = repo.pageTable.Count - pageCount;
    if (nPageCount > 0)
    {
        loger.log("Repository [" + domainInfo.domainName + "] expanded by [" + nPageCount + "] new pages, [" + (pageCount + nPageCount) + "] pages in total.");
    }

    siteTable.AddOrUpdate(repo.entry);
    repo.SaveDataStructure(folder, output);

    return repo;
}
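// Usage sketch (hypothetical names; assumes a finished DLC crawl exposing its
// target collection): the builder creates or expands the per-site repository.
//
//   domainAnalysis domainInfo = new domainAnalysis("http://www.koplas.co.rs");
//   imbMCWebSite site = repositorium.BuildWebSite(crawler.targets, domainInfo, output);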
/// <summary>
/// Gets web site repositorium by clean domain name, like: "koplas.co.rs" for http://www.koplas.co.rs
/// </summary>
/// <param name="domainInfo">The domain information.</param>
/// <param name="autoCreate">if set to <c>true</c> it will automatically create new entry and new repository</param>
/// <param name="output">The log/diagnostic output</param>
/// <returns></returns>
public imbMCWebSite GetWebSite(domainAnalysis domainInfo, bool autoCreate = false, ILogBuilder output = null)
{
    if (!autoCreate && (!siteTable.ContainsKey(domainInfo.domainRootName)))
    {
        return null;
    }

    if (output == null)
    {
        output = aceLog.loger;
    }

    imbMCWebSiteEntry entry = siteTable.GetOrCreate(domainInfo.domainRootName);
    entry.domainProperUrl = domainInfo.urlProper;

    imbMCWebSite repo = entry.domain.LoadDataStructure<imbMCWebSite>(folder, output);
    repo.domainInfo = domainInfo;
    repo.deploy(entry);

    entry.domainProperUrl = repo.domainInfo.urlProper;
    siteTable.AddOrUpdate(entry);

    if (repo.folder == null)
    {
        if (output != null)
        {
            output.log("Warning: folder instance is null in web site repo [" + repo.name + "]");
        }
    }

    return repo;
}
/// <summary>
/// Adds any new domains from the source
/// </summary>
/// <param name="source">The source.</param>
/// <param name="start">Index of the first domain to take from the source</param>
/// <param name="take">Number of domains to take; -1 means all remaining</param>
/// <returns>Number of domains actually added</returns>
public int Add(webSiteSimpleSample source, int start = 0, int take = -1)
{
    int c = 0;
    int i = 0;

    if (take == -1)
    {
        take = source.Count - start;
    }

    foreach (string domain in source.domains)
    {
        if ((i >= start) && (i < (start + take)))
        {
            domainAnalysis da = new domainAnalysis(domain);
            if (Add(da.urlProper))
            {
                c++;
            }
        }
        i++;
    }

    return c;
}
public void StartApp()
{
    ILogBuilder logger = new builderForLogBase();
    app = new testApplication();

    Thread t = new Thread(newThread);
    t.Start();

    // give the application thread time to initialize before querying it
    Thread.Sleep(2000);

    dataBaseTarget dBT = new dataBaseTarget();
    imbACE.Network.tools.systemKnowledge.prepare(dBT, logger);

    domainAnalysis da = new domainAnalysis("http://www.koplas.co.rs");
    Assert.IsNotNull(da.tldDefinition);
}
/// <summary>
/// Load index information from the external data table source
/// </summary>
/// <param name="operation">The operation.</param>
/// <param name="sourceFile">The source file.</param>
/// <param name="domainsString">Domains (by name or URL) defining the sample scope</param>
/// <param name="loger">The loger.</param>
public void ExecuteIndexPageOperation(indexPageTableOperation operation, string sourceFile, List<string> domainsString = null, ILogBuilder loger = null)
{
    List<indexPage> pages = new List<indexPage>();
    int i = 0;
    int c = 0;
    int imax = pages.Count / 20;

    List<indexDomain> domains = new List<indexDomain>();

    // normalize the incoming domain strings to clean domain names
    List<string> dList = new List<string>();
    if (domainsString != null)
    {
        foreach (string dm in domainsString)
        {
            domainAnalysis da = new domainAnalysis(dm);
            dList.Add(da.domainName);
        }
    }
    domainsString = dList;

    switch (operation)
    {
        default:
            break;

        case indexPageTableOperation.flushNotInSample:
            // keep only pages whose domain is in the sample:
            // collect them, clear the table, then re-add
            pages = pageIndexTable.GetList();
            imax = domainsString.Count / 10;

            List<indexPage> newPages = new List<indexPage>();
            foreach (string dm in domainsString)
            {
                newPages.AddRange((IEnumerable<indexPage>)pageIndexTable.GetPagesForDomain(dm));
                if (i > imax)
                {
                    i = 0;
                    loger.log("Selecting pages [" + ((double)c / (double)domainsString.Count).ToString("P2") + "]");
                }
                i++;
                c++;
            }

            pageIndexTable.Clear();
            Save();

            imax = newPages.Count / 20;
            foreach (indexPage page in newPages)
            {
                pageIndexTable.Add(page);
                if (i > imax)
                {
                    i = 0;
                    loger.log("Adding pages [" + ((double)c / (double)newPages.Count).ToString("P2") + "]");
                }
                i++;
                c++;
            }

            domainIndexTable.Clear();
            Save();
            Recheck(loger);
            Publish(new aceAuthorNotation());
            break;

        case indexPageTableOperation.flushNotLoaded:
            // remove pages that were never actually loaded (zero byte size)
            pages = pageIndexTable.GetList();
            foreach (indexPage page in pages)
            {
                if (page.byteSize == 0)
                {
                    pageIndexTable.Remove(page);
                }
                if (i > imax)
                {
                    i = 0;
                    loger.log("Removing pages from index [" + ((double)c / (double)pages.Count).ToString("P2") + "]");
                }
                i++;
                c++;
            }
            break;

        case indexPageTableOperation.loadReviewedTable:
            ApplyManualPageIndex(sourceFile, loger, true);
            break;
    }
}
/// <summary>
/// Closes the session.
/// </summary>
public void CloseSession(IEnumerable<modelSpiderTestRecord> tRecords)
{
    if (imbWEMManager.settings.indexEngine.doSaveFailedURLQueries)
    {
        pageIndexTable.ReadOnlyMode = false;

        int i = 0;
        int c = 0;
        int ic = pageIndexTable.urlsNotInIndex.Count;
        int ib = ic / 10;

        aceLog.log("Deploying queried URLs that were not in the index (" + ic.ToString() + ")");

        foreach (string url in pageIndexTable.urlsNotInIndex)
        {
            i++;
            c++;
            indexPage page = pageIndexTable.GetPageForUrl(url);
            page.url = url;

            domainAnalysis da = new domainAnalysis(url);
            page.domain = da.domainName;
            pageIndexTable.AddOrUpdate(page);

            if (i >= ib)
            {
                aceLog.log("URL processed: " + c.GetRatio(ic).ToString("P2") + " (" + c + ")");
                i = 0;
            }
        }
    }

    if (indexSessionEntry != null)
    {
        aceLog.log("Saving index engine performance : ... ");

        if (!SKIP_INDEXUPDATE)
        {
            var das = imbWEMManager.index.domainIndexTable.GetDomainIndexAssertion(null, true);
            aceLog.log("Saving index engine performance : DomainAssertion done ");

            indexSessionEntry.Domains = domainIndexTable.Count;
            indexSessionEntry.Pages = pageIndexTable.Count;
            indexSessionEntry.PagesEvaluated = pageIndexTable.Where(x => !collectionExtensions.isNullOrEmpty(x.relevancyText)).Count();
            indexSessionEntry.CrawlerHash = experimentManager.CurrentSession.state.setupHash_crawler;
            indexSessionEntry.GlobalSetupHash = experimentManager.CurrentSession.state.setupHash_global;
            indexSessionEntry.Duration = DateTime.Now.Subtract(indexSessionEntry.Start).TotalMinutes;

            aceLog.log("Saving index engine performance : PagesEvaluated counted ");

            indexSessionEntry.CertainityPP = das.certainty;
            indexSessionEntry.MasterTFIDFCoverage = das.masterTFIDFApplied;
            indexSessionEntry.DomainTFIDFs = das[indexDomainContentEnum.completeDomainTFIDF].Count;
        }

        aceLog.log("Saving index engine performance : Saving index ");

        if (imbWEMManager.settings.directReportEngine.doPublishIndexPerformanceTable)
        {
            indexSessionRecords.AddOrUpdate(indexSessionEntry);
        }
    }

    experimentManager.CloseSession(tRecords);
}
/// <summary>
/// Imports sample from a text file
/// </summary>
/// <param name="path">path to file with samples, if * it will open dialog to select the file</param>
/// <param name="inWorkspace">if true, the file path is interpreted as relative to console workspace</param>
/// <param name="sampleName">Name of the sample list, if empty it will not change current sample list name</param>
/// <param name="replace">if set to true it will replace any existing samples in the list</param>
/// <param name="skip">Number of entries to skip, from the imported file</param>
/// <param name="limit">If set above 0, it limits the total number of domains imported</param>
/// <param name="debug">if true it will report on link preprocessing</param>
/// <remarks>
/// Loads the file and adds domain urls from it into the context's sample list
/// </remarks>
/// <seealso cref="aceOperationSetExecutorBase" />
public void aceOperation_addSampleFile(
    [Description("path to file with samples, if * it will open dialog to select the file")] String path = "*",
    [Description("if true, the file path is interpreted as relative to console workspace")] Boolean inWorkspace = true,
    [Description("Name of the sample list, if empty it will not change current sample list name")] String sampleName = "",
    [Description("if set to true it will replace any existing samples in the list")] Boolean replace = false,
    [Description("Number of entries to skip, from the imported file")] Int32 skip = 0,
    [Description("If set above 0, it limits the total number of domains imported")] Int32 limit = -1,
    [Description("if true it will report on link preprocessing")] Boolean debug = true)
{
    IAceAdvancedConsole console = parent as IAceAdvancedConsole;

    if (path == "*")
    {
        String defPath = appManager.Application.folder_projects.path;
        if (inWorkspace)
        {
            if (console != null)
            {
                defPath = console.workspace.folder.path;
            }
        }
        path = dialogs.openSelectFile(imbACE.Services.textBlocks.smart.dialogSelectFileMode.selectFileToOpen, "*.txt", defPath, "Select file to import web domains sample from");
        inWorkspace = false;
    }

    if (Path.IsPathRooted(path))
    {
        inWorkspace = false;
    }

    if (inWorkspace)
    {
        if (console != null)
        {
            path = console.workspace.folder.pathFor(path);
        }
    }

    if (limit == -1)
    {
        limit = 10000;
    }

    if (skip < 0)
    {
        skip = 0;
    }

    if (File.Exists(path))
    {
        if (replace)
        {
            context.sampleList = new webSiteSimpleSample();
        }

        if (!sampleName.isNullOrEmpty())
        {
            context.sampleList.name = sampleName;
        }

        var list = path.openFileToList(true);

        Int32 c = 0;
        foreach (String l in list)
        {
            domainAnalysis da = new domainAnalysis(l);
            if (c < skip)
            {
                if (debug)
                {
                    output.Append(String.Format("Skipping {0,-20} => {1,-20}", l, da.urlProper));
                }
            }
            else
            {
                if (c >= limit)
                {
                    break;
                }
                if (debug)
                {
                    output.Append(String.Format("Adding {0,-20} => {1,-20}", l, da.urlProper));
                }
                context.sampleList.Add(da.urlProper);
            }
            c++; // advance the entry counter so the skip and limit thresholds take effect
        }
    }
    else
    {
        output.log("Sample list file not found at [" + path + "]");
    }
}
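// Console usage sketch (assumption: the command name follows imbACE's
// aceOperation_ prefix convention; the file path is hypothetical):
//
//   addSampleFile "samples/domains.txt" true "mySample" true 0 100 false
//
// This would load up to 100 domains from the workspace-relative file into a
// fresh sample list named "mySample", skipping no entries.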
/// <summary>
/// Processes the link into Targets
/// </summary>
/// <param name="ln">The link.</param>
/// <param name="parentNode">The parent node.</param>
/// <param name="doLinkResolver">if set to <c>true</c>, the URL is normalized before processing.</param>
/// <returns>True if a new target was created</returns>
public bool processLink(link ln, spiderPage parentNode, bool doLinkResolver = true)
{
    bool isNewLink = false;

    #region LINK NORMALIZATION =================================

    if (doLinkResolver)
    {
        ln.url = ln.getAbsoluteUrl(parentNode.webpage);
        ln.url = ln.url.httpsToHttpShema();
        ln.url = ln.url.equalizeUrlWithIndexFilenames();
        ln.url = wRecord.domainInfo.GetResolvedUrl(ln.url, imbWEMManager.settings.linkResolver.LNK_RemoveAnchors);

        try
        {
            domainAnalysis da = new domainAnalysis(ln.url);
            if (ln.url.IndexOf(da.domainName) > -1)
            {
                // if only a single character (the trailing slash) follows the
                // domain name, collapse the URL to its proper canonical form
                int l = ln.url.Length - (ln.url.IndexOf(da.domainName) + da.domainName.Length);
                if (l == 1)
                {
                    ln.url = da.urlProper;
                }
            }
        }
        catch (Exception ex)
        {
            imbWEMManager.log.log("Process link exception: " + ex.Message);
        }
    }

    #endregion ========================================================

    // records the origin reference: page, link and iteration
    spiderLink sln = new spiderLink(parentNode, ln, wRecord.iteration);

    if (!spider.approveUrl(sln.link))
    {
        // the link is not supported / allowed by the crawler policy
        sln.flags |= spiderLinkFlags.urlNotSupported;
    }
    else
    {
        spiderTarget target = targets.GetByTarget(sln);

        if (wRecord.web.webLinks.Add(sln))
        {
            sln.flags |= spiderLinkFlags.newlinkVector;
        }
        else
        {
            sln.flags |= spiderLinkFlags.oldlinkVector;
        }

        if (wRecord.web.webTargets.Add(sln))
        {
            sln.flags |= spiderLinkFlags.newlinkTarget;
        }
        else
        {
            sln.flags |= spiderLinkFlags.oldlinkTarget;
        }

        if (sln.flags.HasFlag(spiderLinkFlags.newlinkTarget) || (target == null))
        {
            if (target == null)
            {
                isNewLink = true;
                target = targets.GetOrCreateTarget(sln, true, true);

                // adds the link to the list of active links
                wRecord.web.webActiveLinks.Add(sln);
            }
            else
            {
                isNewLink = false;
            }
        }
    }

    return isNewLink;
}
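// Minimal sketch of the same trailing-slash rule in isolation (hypothetical
// helper; domainAnalysis semantics assumed as in the method above):
public static string CollapseBareDomainUrl(string url)
{
    domainAnalysis da = new domainAnalysis(url);
    int at = url.IndexOf(da.domainName);
    if ((at > -1) && ((url.Length - (at + da.domainName.Length)) == 1))
    {
        return da.urlProper; // e.g. "http://koplas.co.rs/" collapses to the proper form
    }
    return url; // deeper paths such as "/products" pass through unchanged
}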