コード例 #1
0
        /// <summary>
        /// Classifies a link vector against the targets currently stored in <c>items</c>.
        /// </summary>
        /// <param name="linkVector">The link vector to classify; its <c>url</c> is hashed to find the target.</param>
        /// <returns>
        /// <c>newlinkTarget | newlinkVector</c> when no target is stored for the URL hash; otherwise
        /// <c>oldlinkTarget</c> combined with <c>oldlinkVector</c> or <c>newlinkVector</c>, depending on
        /// whether the target already holds a vector under the key produced by <c>GetKeyForVector</c>.
        /// </returns>
        public spiderLinkFlags GetFlags(spiderLink linkVector)
        {
            string targetKey = GetHash(linkVector.url);

            // Unknown target: both the target and the vector are new
            if (!items.ContainsKey(targetKey))
            {
                return spiderLinkFlags.newlinkTarget | spiderLinkFlags.newlinkVector;
            }

            // Known target: check whether this particular vector was already registered on it
            string vectorKey     = GetKeyForVector(linkVector);
            bool   isKnownVector = items[targetKey].linkVectors.ContainsKey(vectorKey);

            spiderLinkFlags vectorFlag = isKnownVector ? spiderLinkFlags.oldlinkVector : spiderLinkFlags.newlinkVector;

            return spiderLinkFlags.oldlinkTarget | vectorFlag;
        }
コード例 #2
0
        /// <summary>
        /// Looks up the target stored under the hash of the link's origin page URL.
        /// </summary>
        /// <param name="linkVector">The link vector; its <c>originPage.url</c> is hashed to form the lookup key.</param>
        /// <returns>The stored target, or <c>null</c> when no entry exists for that key.</returns>
        public spiderTarget GetByOrigin(spiderLink linkVector)
        {
            string originKey = GetHash(linkVector.originPage.url);

            if (!items.ContainsKey(originKey))
            {
                return null;
            }

            return items[originKey];
        }
コード例 #3
0
        /// <summary>
        /// Looks up the target stored under the hash of the link's own URL.
        /// </summary>
        /// <param name="linkVector">The link vector; its <c>url</c> is hashed to form the lookup key.</param>
        /// <returns>The stored target, or <c>null</c> when no entry exists for that key.</returns>
        public spiderTarget GetByTarget(spiderLink linkVector)
        {
            string targetKey = GetHash(linkVector.url);

            return items.ContainsKey(targetKey) ? items[targetKey] : null;
        }
コード例 #4
0
ファイル: spiderTarget.cs プロジェクト: gorangrubic/imbWEM
        /// <summary>
        /// Adds the vector to this target unless a vector with the same key is already registered.
        /// <see cref="spiderLink.originPage"/> has to be specified, otherwise an exception is thrown.
        /// </summary>
        /// <param name="__vector">The vector to add.</param>
        /// <returns><c>true</c> if the vector was new for this target and has been deployed; <c>false</c> if its key was already present in <c>linkVectors</c>.</returns>
        /// <exception cref="aceGeneralException">The supplied vector has no origin page (<c>originPage</c> is <c>null</c>).</exception>
        public bool AddVector(spiderLink __vector)
        {
            // Guard first: a vector without an origin page cannot be registered
            if (__vector.originPage == null)
            {
                throw new aceGeneralException("Supplied spiderLink vector has no origin specified", null, __vector, "No origin page for link: " + __vector.url);
            }

            string hk = getKey(__vector);

            if (linkVectors.ContainsKey(hk))
            {
                return(false);
            }

            // Pass the already computed key so deploy does not have to compute it again
            deploy(__vector, hk);
            return(true);
        }
コード例 #5
0
        /// <summary>
        /// Returns the target stored for the link's URL hash, creating a new one when none exists.
        /// </summary>
        /// <param name="linkVector">The link vector; its <c>url</c> is hashed to form the lookup key.</param>
        /// <param name="autoAdd">When <c>true</c>, a newly created target is added to <c>items</c>; when <c>false</c> it is only returned.</param>
        /// <param name="processVector">When <c>true</c> and the target already exists, the vector is passed to <c>AddVector</c> on it. Not consulted for a newly created target, which receives the vector through its constructor.</param>
        /// <returns>The existing or newly created target.</returns>
        public spiderTarget GetOrCreateTarget(spiderLink linkVector, bool autoAdd = true, bool processVector = true)
        {
            string targetKey = GetHash(linkVector.url);

            if (!items.ContainsKey(targetKey))
            {
                // No target yet for this URL: build one from the vector
                spiderTarget created = new spiderTarget(linkVector, this);
                if (autoAdd)
                {
                    items.Add(targetKey, created);
                }

                return created;
            }

            spiderTarget existing = items[targetKey];
            if (processVector)
            {
                existing.AddVector(linkVector);
            }

            return existing;
        }
コード例 #6
0
ファイル: spiderTarget.cs プロジェクト: gorangrubic/imbWEM
        /// <summary>
        /// Initializes this target from a link vector (when it has no URL yet), registers the vector in
        /// <c>linkVectors</c> and feeds tokens extracted from the URL and the vector captions into <c>tokens</c>.
        /// When no vector is supplied, the tokens are extracted from the target's current <c>url</c> instead.
        /// </summary>
        /// <param name="__vector">The link vector to deploy; may be <c>null</c>.</param>
        /// <param name="__vkey">Key under which the vector is stored in <c>linkVectors</c>; computed with <c>getKey</c> when <c>null</c>.</param>
        protected void deploy(spiderLink __vector = null, string __vkey = null)
        {
            List <string> tkns = new List <string>();

            // __vector defaults to null, so it must be checked before it is used to initialize the target
            if (__vector != null && url.isNullOrEmpty())
            {
                url = __vector.url;
                iterationDiscovery = __vector.iterationDiscovery;
                targetHash         = __vector.targetHash;
            }

            if (__vector != null)
            {
                if (__vkey == null)
                {
                    __vkey = getKey(__vector);
                }

                linkVectors.Add(__vkey, __vector);

                string rUrl = parent.wRecord.domainInfo.GetURLWithoutDomainName(__vector.url);

                // NOTE(review): captions are only tokenized when the domain-relative URL is non-empty - confirm this is intended
                if (rUrl.Length > 0)
                {
                    if (!__vector.domain.isNullOrEmpty())
                    {
                        rUrl = rUrl.Replace(__vector.domain, "");
                    }

                    var r = rUrl.getTokens(true, false, true, true, 1);

                    if (semanticLexiconManager.lexiconCache != null)
                    {
                        r = semanticLexiconManager.lexiconCache.decodeTwins(r);
                    }

                    var c = new List <string>();

                    foreach (string caption in __vector.captions)
                    {
                        if (!caption.isNullOrEmpty())
                        {
                            c.AddRange(caption.getTokens(true, false, false, true));
                        }
                    }

                    tkns.AddRange(r);

                    tkns.AddRange(c);
                }
            }
            else if (!url.isNullOrEmpty())
            {
                // No vector supplied: fall back to the URL this target already has, if any
                tkns.AddRange(url.getTokens(true, true, true));
            }

            // Create the token table for this target on first use
            if (tokens == null)
            {
                tokens           = parent.dlTargetLinkTokens.AddTable("tkns_" + key) as termDocument;
                tokens.expansion = 0;
            }

            tokens.AddTokens(tkns);
        }
コード例 #7
0
ファイル: spiderTarget.cs プロジェクト: gorangrubic/imbWEM
 /// <summary>
 /// Computes the key identifying the given vector by delegating to the parent collection's <c>GetKeyForVector</c>.
 /// </summary>
 /// <param name="__vector">The vector to compute the key for.</param>
 /// <returns>The key returned by <c>parent.GetKeyForVector</c>.</returns>
 protected string getKey(spiderLink __vector)
 {
     string vectorKey = parent.GetKeyForVector(__vector);

     return vectorKey;
 }
コード例 #8
0
ファイル: spiderTarget.cs プロジェクト: gorangrubic/imbWEM
 /// <summary>
 /// Creates a target owned by the given collection and deploys the supplied vector into it.
 /// </summary>
 /// <param name="__vector">The link vector handed to <c>deploy</c>.</param>
 /// <param name="__parent">The collection that owns this target.</param>
 public spiderTarget(spiderLink __vector, spiderTargetCollection __parent)
 {
     // parent has to be assigned before deploy runs
     this.parent = __parent;

     this.deploy(__vector);
 }
コード例 #9
0
ファイル: spiderDLContext.cs プロジェクト: gorangrubic/imbWEM
        /// <summary>
        /// Processes the link into Targets: optionally normalizes its URL, wraps it into a <see cref="spiderLink"/>,
        /// flags it as a new or old vector/target and creates a target for it when none exists yet.
        /// </summary>
        /// <param name="ln">The link to process; its <c>url</c> is overwritten when <paramref name="doLinkResolver"/> is <c>true</c>.</param>
        /// <param name="parentNode">The page the link originates from.</param>
        /// <param name="doLinkResolver">if set to <c>true</c> the link URL is normalized before processing.</param>
        /// <returns><c>true</c> if a new target was created for the link; otherwise <c>false</c></returns>
        public bool processLink(link ln, spiderPage parentNode, bool doLinkResolver = true)
        {
            bool isNewLink = false;

            #region LINK NORMALIZATION =================================
            if (doLinkResolver)
            {
                ln.url = ln.getAbsoluteUrl(parentNode.webpage);
                ln.url = ln.url.httpsToHttpShema();
                ln.url = ln.url.equalizeUrlWithIndexFilenames();
                ln.url = wRecord.domainInfo.GetResolvedUrl(ln.url, imbWEMManager.settings.linkResolver.LNK_RemoveAnchors);

                try
                {
                    domainAnalysis da = new domainAnalysis(ln.url);

                    // URLs are not linguistic text, so the domain name is located with an ordinal search
                    int domainPosition = ln.url.IndexOf(da.domainName, StringComparison.Ordinal);

                    if (domainPosition > -1)
                    {
                        // Exactly one character follows the domain name (presumably a trailing slash - TODO confirm)
                        int trailingLength = ln.url.Length - (domainPosition + da.domainName.Length);
                        if (trailingLength == 1)
                        {
                            ln.url = da.urlProper;
                        }
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: a failed domain analysis is logged and the URL is used as resolved so far
                    imbWEMManager.log.log("Process link exception: " + ex.Message);
                }
            }
            #endregion ========================================================

            // Records the origin reference: page, link and iteration
            spiderLink sln = new spiderLink(parentNode, ln, wRecord.iteration);

            if (!spider.approveUrl(sln.link))
            {
                // The link is not desirable / not allowed
                sln.flags |= spiderLinkFlags.urlNotSupported;
                return(isNewLink);
            }

            // Looked up before the link is registered below
            spiderTarget target = targets.GetByTarget(sln);

            bool isNewVector = wRecord.web.webLinks.Add(sln);
            sln.flags |= isNewVector ? spiderLinkFlags.newlinkVector : spiderLinkFlags.oldlinkVector;

            bool isNewTarget = wRecord.web.webTargets.Add(sln);
            sln.flags |= isNewTarget ? spiderLinkFlags.newlinkTarget : spiderLinkFlags.oldlinkTarget;

            // Only a link without an existing target creates one
            if (target == null)
            {
                isNewLink = true;
                targets.GetOrCreateTarget(sln, true, true);

                // Records the link into the list of active links
                wRecord.web.webActiveLinks.Add(sln);
            }

            return(isNewLink);
        }
コード例 #10
0
 /// <summary>
 /// Builds the key identifying a link vector: the MD5 hash of the origin page URL, the captions
 /// rendered by <c>toCsvInLine</c> and the vector's <c>urls</c> member, concatenated in that order.
 /// </summary>
 /// <param name="linkVector">The link vector; its <c>originPage</c> is dereferenced here.</param>
 /// <returns>The value returned by <c>md5.GetMd5Hash</c> for the concatenated text.</returns>
 public string GetKeyForVector(spiderLink linkVector)
 {
     var keySource = linkVector.originPage.url + linkVector.captions.toCsvInLine() + linkVector.urls;

     return md5.GetMd5Hash(keySource);
 }