/// <summary>
/// Classifies a link vector against the targets already stored in <c>items</c>.
/// </summary>
/// <param name="linkVector">The link vector to classify; its <c>url</c> is hashed into the target key.</param>
/// <returns>
/// <c>newlinkTarget | newlinkVector</c> when no target is stored for the URL; otherwise <c>oldlinkTarget</c>
/// combined with <c>oldlinkVector</c> or <c>newlinkVector</c>, depending on whether the target already holds
/// a vector with the same vector key.
/// </returns>
public spiderLinkFlags GetFlags(spiderLink linkVector)
{
    string targetKey = GetHash(linkVector.url);

    // Unknown target: both the target and the vector are new.
    if (!items.ContainsKey(targetKey))
    {
        return spiderLinkFlags.newlinkTarget | spiderLinkFlags.newlinkVector;
    }

    // Known target: only the vector still has to be classified.
    string vectorKey = GetKeyForVector(linkVector);
    bool vectorKnown = items[targetKey].linkVectors.ContainsKey(vectorKey);

    spiderLinkFlags vectorFlag = vectorKnown
        ? spiderLinkFlags.oldlinkVector
        : spiderLinkFlags.newlinkVector;

    return spiderLinkFlags.oldlinkTarget | vectorFlag;
}
/// <summary>
/// Returns the target stored under the hash of the origin page URL of the given link vector.
/// </summary>
/// <param name="linkVector">The link vector whose <c>originPage.url</c> is hashed into the lookup key.</param>
/// <returns>The stored target, or <c>null</c> when <c>items</c> has no entry for that key.</returns>
public spiderTarget GetByOrigin(spiderLink linkVector)
{
    string originKey = GetHash(linkVector.originPage.url);

    return items.ContainsKey(originKey) ? items[originKey] : null;
}
/// <summary>
/// Returns the target stored under the hash of the URL the given link vector points to.
/// </summary>
/// <param name="linkVector">The link vector whose <c>url</c> is hashed into the lookup key.</param>
/// <returns>The stored target, or <c>null</c> when <c>items</c> has no entry for that key.</returns>
public spiderTarget GetByTarget(spiderLink linkVector)
{
    string targetKey = GetHash(linkVector.url);

    if (!items.ContainsKey(targetKey))
    {
        return null;
    }

    return items[targetKey];
}
/// <summary>
/// Adds a link vector to this target. <see cref="spiderLink.originPage"/> has to be set, otherwise an exception is thrown.
/// </summary>
/// <param name="__vector">The link vector to add.</param>
/// <returns>
/// <c>true</c> if no vector with the same key was present in <c>linkVectors</c> and the vector was handed to <c>deploy</c>;
/// <c>false</c> if a vector with the same key is already present.
/// </returns>
/// <exception cref="aceGeneralException">The supplied vector has no origin page.</exception>
public bool AddVector(spiderLink __vector)
{
    // Guard first: the vector key is derived from the origin page, so it cannot be computed without one.
    if (__vector.originPage == null)
    {
        throw new aceGeneralException("Supplied spiderLink vector has no origin specified", null, __vector, "No origin page for link: " + __vector.url);
    }

    string hk = getKey(__vector);

    if (linkVectors.ContainsKey(hk))
    {
        return false;
    }

    // Pass the key along so deploy does not have to compute it again.
    deploy(__vector, hk);
    return true;
}
/// <summary>
/// Returns the target stored for the URL of the given link vector, creating a new target when none is stored.
/// </summary>
/// <param name="linkVector">The link vector whose <c>url</c> is hashed into the lookup key.</param>
/// <param name="autoAdd">If <c>true</c>, a newly created target is added to <c>items</c>.</param>
/// <param name="processVector">If <c>true</c>, the vector is passed to <c>AddVector</c> of an already stored target.</param>
/// <returns>The stored target, or the newly created one.</returns>
public spiderTarget GetOrCreateTarget(spiderLink linkVector, bool autoAdd = true, bool processVector = true)
{
    string hash = GetHash(linkVector.url);

    if (!items.ContainsKey(hash))
    {
        // The constructor receives the vector, so AddVector is not called on this path.
        spiderTarget created = new spiderTarget(linkVector, this);

        if (autoAdd)
        {
            items.Add(hash, created);
        }

        return created;
    }

    spiderTarget existing = items[hash];

    if (processVector)
    {
        existing.AddVector(linkVector);
    }

    return existing;
}
/// <summary>
/// Registers a link vector with this target and feeds the tokens derived from it into the target's token table.
/// When the target has no <c>url</c> yet, <c>url</c>, <c>iterationDiscovery</c> and <c>targetHash</c> are taken from the vector.
/// </summary>
/// <param name="__vector">
/// The vector to register. When <c>null</c>, no vector is registered and the target's own <c>url</c>
/// (if it has one) is tokenized instead.
/// </param>
/// <param name="__vkey">Key under which the vector is stored in <c>linkVectors</c>; computed with <c>getKey</c> when <c>null</c>.</param>
protected void deploy(spiderLink __vector = null, string __vkey = null)
{
    List<string> tkns = new List<string>();

    if (__vector != null)
    {
        // Identity can only come from a vector, so this initialisation lives inside the null check.
        // Previously it ran before the check and threw NullReferenceException for deploy() on a target without url.
        if (url.isNullOrEmpty())
        {
            url = __vector.url;
            iterationDiscovery = __vector.iterationDiscovery;
            targetHash = __vector.targetHash;
        }

        if (__vkey == null)
        {
            __vkey = getKey(__vector);
        }

        linkVectors.Add(__vkey, __vector);

        string rUrl = parent.wRecord.domainInfo.GetURLWithoutDomainName(__vector.url);

        // Both URL tokens and caption tokens are collected only when the remaining URL is non-empty.
        if (rUrl.Length > 0)
        {
            if (!__vector.domain.isNullOrEmpty())
            {
                rUrl = rUrl.Replace(__vector.domain, "");
            }

            var r = rUrl.getTokens(true, false, true, true, 1);

            if (semanticLexiconManager.lexiconCache != null)
            {
                r = semanticLexiconManager.lexiconCache.decodeTwins(r);
            }

            var c = new List<string>();

            foreach (string caption in __vector.captions)
            {
                if (!caption.isNullOrEmpty())
                {
                    c.AddRange(caption.getTokens(true, false, false, true));
                }
            }

            tkns.AddRange(r);
            tkns.AddRange(c);
        }
    }
    else if (!url.isNullOrEmpty())
    {
        // No vector supplied: fall back to tokenizing the URL this target already has.
        tkns.AddRange(url.getTokens(true, true, true));
    }

    // Create the token table on first use.
    if (tokens == null)
    {
        tokens = parent.dlTargetLinkTokens.AddTable("tkns_" + key) as termDocument;
        tokens.expansion = 0;
    }

    tokens.AddTokens(tkns);
}
/// <summary>
/// Computes the key of a link vector by delegating to <c>GetKeyForVector</c> of the parent collection.
/// </summary>
/// <param name="__vector">The link vector to compute the key for.</param>
/// <returns>The key returned by the parent collection.</returns>
protected string getKey(spiderLink __vector)
{
    string vectorKey = parent.GetKeyForVector(__vector);

    return vectorKey;
}
/// <summary>
/// Creates a target owned by the given collection and deploys the supplied link vector into it.
/// </summary>
/// <param name="__vector">The link vector handed to <c>deploy</c>.</param>
/// <param name="__parent">The collection stored as this target's <c>parent</c>.</param>
public spiderTarget(spiderLink __vector, spiderTargetCollection __parent)
{
    // parent must be assigned first: deploy reads it.
    this.parent = __parent;
    this.deploy(__vector);
}
/// <summary>
/// Processes a link into the targets: optionally normalizes its URL, wraps it in a <see cref="spiderLink"/>,
/// flags it as new/old vector and target, and creates a target for it when none is registered yet.
/// </summary>
/// <param name="ln">The link to process; its <c>url</c> is overwritten when <paramref name="doLinkResolver"/> is <c>true</c>.</param>
/// <param name="parentNode">The page the link was found on.</param>
/// <param name="doLinkResolver">if set to <c>true</c>, the link URL is normalized before further processing.</param>
/// <returns><c>true</c> if a new target was created for the link; otherwise <c>false</c>.</returns>
public bool processLink(link ln, spiderPage parentNode, bool doLinkResolver = true)
{
    #region LINK NORMALIZATION =================================
    if (doLinkResolver)
    {
        ln.url = ln.getAbsoluteUrl(parentNode.webpage);
        ln.url = ln.url.httpsToHttpShema();
        ln.url = ln.url.equalizeUrlWithIndexFilenames();
        ln.url = wRecord.domainInfo.GetResolvedUrl(ln.url, imbWEMManager.settings.linkResolver.LNK_RemoveAnchors);

        try
        {
            domainAnalysis da = new domainAnalysis(ln.url);

            // URLs are machine identifiers, so search ordinally (not by current culture) and only once.
            int domainAt = ln.url.IndexOf(da.domainName, StringComparison.Ordinal);

            if (domainAt > -1)
            {
                // Exactly one character after the domain name: replace the URL with the analysed form.
                int trailing = ln.url.Length - (domainAt + da.domainName.Length);

                if (trailing == 1)
                {
                    ln.url = da.urlProper;
                }
            }
        }
        catch (Exception ex)
        {
            // Best effort: a failed domain analysis is logged and the URL is used as resolved so far.
            imbWEMManager.log.log("Process link exception: " + ex.Message);
        }
    }
    #endregion ========================================================

    // Records the origin reference: page, link and iteration.
    spiderLink sln = new spiderLink(parentNode, ln, wRecord.iteration);

    if (!spider.approveUrl(sln.link))
    {
        // The link is not desired / not allowed.
        sln.flags |= spiderLinkFlags.urlNotSupported;
        return false;
    }

    // Look the target up before the link is registered below.
    spiderTarget target = targets.GetByTarget(sln);

    sln.flags |= wRecord.web.webLinks.Add(sln)
        ? spiderLinkFlags.newlinkVector
        : spiderLinkFlags.oldlinkVector;

    sln.flags |= wRecord.web.webTargets.Add(sln)
        ? spiderLinkFlags.newlinkTarget
        : spiderLinkFlags.oldlinkTarget;

    // Only a missing target leads to any action; the former additional newlinkTarget test
    // could not change the outcome, so it was dropped.
    if (target != null)
    {
        return false;
    }

    targets.GetOrCreateTarget(sln, true, true);

    // Writes the link into the list of active links.
    wRecord.web.webActiveLinks.Add(sln);

    return true;
}
/// <summary>
/// Builds the key of a link vector: the MD5 hash of its origin page URL, its captions rendered by
/// <c>toCsvInLine</c>, and its <c>urls</c> member, concatenated in that order.
/// </summary>
/// <param name="linkVector">The link vector to build the key for; its <c>originPage</c> is dereferenced.</param>
/// <returns>The value returned by <c>md5.GetMd5Hash</c> for the concatenated input.</returns>
public string GetKeyForVector(spiderLink linkVector)
{
    var originUrl = linkVector.originPage.url;
    var captionCsv = linkVector.captions.toCsvInLine();

    // NOTE(review): this reads linkVector.urls while the rest of the code uses linkVector.url - confirm this is intended.
    var hashInput = originUrl + captionCsv + linkVector.urls;

    return md5.GetMd5Hash(hashInput);
}