Ejemplo n.º 1
0
        /// <summary>
        /// Counts the outflow links of <paramref name="itemX"/>'s page that target <paramref name="itemY"/>.
        /// </summary>
        /// <param name="itemY">Target whose <c>targetHash</c> is looked up among the outflow links.</param>
        /// <param name="itemX">Target whose page supplies the outflow links.</param>
        /// <returns>
        /// Number of matching links; <c>0</c> when <paramref name="itemX"/> has no <c>spiderPage</c>
        /// attached or when no link list is returned.
        /// </returns>
        public int getLinkCountRotated(ISpiderTarget itemY, ISpiderTarget itemX)
        {
            spiderPage page = itemX.page as spiderPage;
            if (page == null)
            {
                // The target has no page (or a page of another type): there are no outflow links to count.
                return 0;
            }

            List<spiderLink> links = page.relationship.outflowLinks.GetTargetingTo(itemY.targetHash);

            return (links == null) ? 0 : links.Count;
        }
Ejemplo n.º 2
0
        //public spiderLink(String url)
        //{
        //    link = new link(url);
        //    Uri tmp = new Uri(url);
        //    domain = tmp.Host;
        //    url = link.url.ToString();
        //    name = link.caption;
        //    description = "";
        //  //  originPage = __home;
        //   // iteration = __iteracija;

        //}



        /// <summary>
        /// Creates a spider link from a <paramref name="__link"/> found on the page <paramref name="__home"/>.
        /// </summary>
        /// <param name="__home">Page of origin; must not be <c>null</c>.</param>
        /// <param name="__link">The underlying link; must not be <c>null</c>. Its url, caption and domain are copied into this instance.</param>
        /// <param name="__iteracija">Iteration in which the link was discovered; stored in <c>iterationDiscovery</c>.</param>
        /// <exception cref="aceGeneralException">Thrown when <paramref name="__home"/> or <paramref name="__link"/> is <c>null</c>.</exception>
        public spiderLink(spiderPage __home, link __link, int __iteracija)
        {
            if (__home == null)
            {
                throw new aceGeneralException("Page of origin for this link never provided", null, this, "Bad arguments at constructor");
            }

            if (__link == null)
            {
                // Fail with a descriptive error instead of a NullReferenceException on link.url below.
                throw new aceGeneralException("Link instance for this spiderLink never provided", null, this, "Bad arguments at constructor");
            }

            link               = __link;
            url                = link.url.ToString();
            originPage         = __home;
            iterationDiscovery = __iteracija;
            name               = link.caption;
            domain             = link.domain;

            // Register the first known caption and url for this link.
            captions.Add(link.caption);
            urls.AddInstance(url, "Link urls @ spiderLink");
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Attaches the page - if the page was already attached returns <c>false</c>.
        /// When a HTML document is available, the page text is extracted, hashed, tokenized,
        /// evaluated and the results are pushed into the parent record's collections.
        /// </summary>
        /// <param name="__page">The page.</param>
        /// <param name="response">Log builder handed over to <c>content.AddTokens</c> when TF-IDF processing is enabled.</param>
        /// <param name="targetBlockCount">Block count passed to <c>contentTree.getBlocks</c> when block-tree processing is enabled.</param>
        /// <returns>
        /// <c>true</c> when <paramref name="__page"/> differs from the currently attached page (even if no HTML
        /// document could be obtained); <c>false</c> when the same page instance was already attached.
        /// </returns>
        public bool AttachPage(spiderPage __page, ILogBuilder response, int targetBlockCount = 3)
        {
            if (page != __page)
            {
                page = __page;

                HtmlDocument htmlDoc = GetHtmlDocument();


                // Remember the iteration of the parent record at the moment of attachment.
                iterationLoaded = parent.wRecord.iteration;

                if (htmlDoc != null)
                {
                    XPathNavigator xnav = htmlDoc.DocumentNode.CreateNavigator();

                    // Extract the text using the configured retrieval settings.
                    pageText = xnav.retriveText(imbWEMManager.settings.contentProcessor.textRetrieve);

                    // Decode HTML entities first, so the hash is computed over the decoded text.
                    pageText = WebUtility.HtmlDecode(pageText);
                    pageHash = md5.GetMd5Hash(pageText);

                    if (parent.wRecord.tRecord.instance.settings.doEnableDLC_BlockTree)
                    {
                        contentTree   = htmlDoc.buildTree(page.webpage.domain); // legacy form: contentTree = new nodeTree(page.webpage.domain, htmlDoc);
                        contentBlocks = contentTree.getBlocks(targetBlockCount);
                        contentBlocks.CalculateScores();
                    }


                    // Domain words are passed to the evaluator as tokens to ignore (see evaluate() call below).
                    var ignoreTokens = parent.wRecord.domainInfo.domainWords;

                    var preprocessedTokens = parent.wRecord.tRecord.evaluator.GetAllProperTokensSortedByFrequency(pageText);


                    if (parent.wRecord.tRecord.instance.settings.doEnableDLC_TFIDF)
                    {
                        // Register a term document for this target (keyed by `key`) and fill it with the page tokens.
                        content           = parent.dlTargetPageTokens.AddTable(key) as termDocument;
                        content.expansion = parent.wRecord.tRecord.instance.settings.TermExpansionForContent;
                        content.AddTokens(preprocessedTokens.ToList(), response);
                    }



                    // NOTE(review): evaluationOk is assigned below but never read in this method.
                    bool evaluationOk = false;

                    indexPageEvaluationEntryState pageState = indexPageEvaluationEntryState.haveNoEvaluationEntry;


                    // Only in full-trust mode the index is consulted; otherwise the page is treated as not indexed.
                    if (imbWEMManager.settings.indexEngine.doIndexFullTrustMode)
                    {
                        pageState = imbWEMManager.index.pageIndexTable.GetPageAssertion(url);
                    }
                    else
                    {
                        pageState = indexPageEvaluationEntryState.notInTheIndex;
                    }

                    if (pageState.HasFlag(indexPageEvaluationEntryState.haveEvaluationEntry))
                    {
                        // The index already has an evaluation entry: no evaluator call is made on this path.
                        // NOTE(review): result_language receives the value evaluatedLanguage had BEFORE it is
                        // overwritten two lines below, and evaluatedLanguage is set to serbian regardless of the
                        // isRelevant flag - confirm this ordering/assignment is intended.
                        evaluation = new multiLanguageEvaluation();
                        evaluation.result_language = evaluatedLanguage;
                        evaluationOk      = pageState.HasFlag(indexPageEvaluationEntryState.isRelevant);
                        evaluatedLanguage = basicLanguageEnum.serbian;
                    }
                    else
                    {
                        // No usable index entry: run the evaluator on the page text.
                        evaluation        = parent.wRecord.tRecord.evaluator.evaluate(pageText, ignoreTokens, preprocessedTokens.ToList());
                        evaluatedLanguage = evaluation.result_language;
                    }



                    // Updates of the shared term/relevant-page collections are done under RelevantPageLock.
                    lock (RelevantPageLock)
                    {
                        if (IsRelevant)
                        {
                            parent.wRecord.context.targets.termSerbian.AddRange(preprocessedTokens);

                            parent.wRecord.relevantPages.AddUnique(__page.url);

                            parent.wRecord.tRecord.relevantPages.AddUnique(__page.url);
                        }

                        else
                        {
                            parent.wRecord.context.targets.termOther.AddRange(preprocessedTokens);
                        }

                        // Tokens are added to termsAll on both branches.
                        parent.wRecord.context.targets.termsAll.AddRange(preprocessedTokens);
                    }



                    // Notify subscribers (if any) that a page was attached to this target.

                    //targs.htmlDoc = htmlDoc;
                    if (parent.wRecord.context.OnTargetPageAttached != null)
                    {
                        var targs = new modelSpiderSiteRecordEventArgs(this);

                        parent.wRecord.context.OnTargetPageAttached(parent.wRecord, targs);
                    }
                }

                // A new page instance was attached, even when no HTML document was available.
                return(true);
            }
            return(false);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Processes loader result: flags content duplicates, registers each loaded page and
        /// harvests links from the pages that were loaded successfully.
        /// </summary>
        /// <param name="stResult">The st result.</param>
        /// <param name="doLinkResolver">Performs LinkResolver component tasks over each harvested link (forwarded to <c>processLink</c>)</param>
        /// <param name="doLinkHarvest">Extract designated linkNature and linkScope from the content</param>
        /// <param name="nature">The nature of links to harvest - flags</param>
        /// <param name="scope">The scope of links to harvest - flags</param>
        /// <returns>Number of result items whose status was <c>pageStatus.failed</c></returns>
        public int processLoaderResult(spiderTaskResult stResult, bool doLinkResolver = true, bool doLinkHarvest = true, linkNature nature = linkNature.navigation, linkScope scope = linkScope.inner)
        {
            int failedCount = 0;

            foreach (spiderTaskResultItem cresult in stResult) // iterates through all loaded pages
            {
                spiderPage pg     = cresult.sPage;
                bool       loaded = cresult.status != pageStatus.failed;

                if (loaded)
                {
                    // Count this content hash; more than one occurrence means the content was seen before.
                    web.webPageContentHashList.AddInstance(pg.contentHash, 1);
                    var occurrences = web.webPageContentHashList[pg.contentHash];

                    if (occurrences > 1)
                    {
                        if (imbWEMManager.settings.executionLog.doPageErrorOrDuplicateLog)
                        {
                            aceLog.log("Page [" + pg.url + "] - is content duplicate ");
                        }
                        wRecord.listOfDuplicatedPages.Add(new contentHashAndAddressEntry(pg.url, pg.contentHash, occurrences));
                        wRecord.duplicateCount++;

                        var duplicateTarget = targets.GetByTarget(cresult.target);
                        if (duplicateTarget != null)
                        {
                            duplicateTarget.isDuplicate = true;
                        }

                        // NOTE(review): duplicates skip page registration, link harvest and cresult.dispose() -
                        // confirm that not disposing the result item here is intended.
                        continue;
                    }
                }

                cresult.target.targetedPage = pg;  // writes the page reference into the link

                if (!wRecord.web.webPages.Add(pg)) // registers the page in the webPages set
                {
                    wRecord.logBuilder.log("Web page [" + pg.url + "] was loaded before - check the algorithm");
                }

                if (!loaded)
                {
                    failedCount++;
                }
                else if (doLinkHarvest)
                {
                    // Materialize the selection before processing: processLink rewrites link urls in place.
                    List<link> links = cresult.page.links.Where<link>(x => (x.nature.HasFlag(nature) && x.scope.HasFlag(scope))).ToList();

                    foreach (link harvested in links)
                    {
                        // Forward doLinkResolver so the caller's choice is honoured.
                        processLink(harvested, cresult.sPage, doLinkResolver);
                    }
                }

                cresult.dispose();
            }

            if (OnLoaderTaskProcessed != null)
            {
                OnLoaderTaskProcessed(wRecord, new modelSpiderSiteRecordEventArgs(stResult));
            }

            return failedCount;
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Processes the link into Targets: optionally normalizes its url, wraps it into a
        /// <c>spiderLink</c>, registers it in the web link/target sets and creates a target when none exists yet.
        /// </summary>
        /// <param name="ln">The link; its <c>url</c> is rewritten in place when <paramref name="doLinkResolver"/> is set.</param>
        /// <param name="parentNode">The page on which the link was found.</param>
        /// <param name="doLinkResolver">if set to <c>true</c> the url is normalized before the link is registered.</param>
        /// <returns><c>true</c> only when the url was approved and no target existed for the link, so a new one was created</returns>
        public bool processLink(link ln, spiderPage parentNode, bool doLinkResolver = true)
        {
            if (doLinkResolver)
            {
                // Normalization pipeline: absolute url -> http schema -> index filename equalization -> domain-level resolution.
                ln.url = ln.getAbsoluteUrl(parentNode.webpage);
                ln.url = ln.url.httpsToHttpShema();
                ln.url = ln.url.equalizeUrlWithIndexFilenames();
                ln.url = wRecord.domainInfo.GetResolvedUrl(ln.url, imbWEMManager.settings.linkResolver.LNK_RemoveAnchors);

                try
                {
                    domainAnalysis analysis = new domainAnalysis(ln.url);
                    int            domainAt = ln.url.IndexOf(analysis.domainName);

                    if (domainAt > -1)
                    {
                        // Exactly one character after the domain name: replace the url with the proper domain url.
                        int trailing = ln.url.Length - (domainAt + analysis.domainName.Length);
                        if (trailing == 1)
                        {
                            ln.url = analysis.urlProper;
                        }
                    }
                }
                catch (Exception ex)
                {
                    imbWEMManager.log.log("Process link exception: " + ex.Message);
                }
            }

            // Records the origin of the link: page, link and iteration.
            spiderLink sln = new spiderLink(parentNode, ln, wRecord.iteration);

            if (!spider.approveUrl(sln.link))
            {
                // The link is not desired / not allowed.
                sln.flags |= spiderLinkFlags.urlNotSupported;
                return false;
            }

            // Look the target up BEFORE the link is registered in the web sets.
            spiderTarget existing = targets.GetByTarget(sln);

            bool vectorIsNew = wRecord.web.webLinks.Add(sln);
            sln.flags |= vectorIsNew ? spiderLinkFlags.newlinkVector : spiderLinkFlags.oldlinkVector;

            bool targetIsNew = wRecord.web.webTargets.Add(sln);
            sln.flags |= targetIsNew ? spiderLinkFlags.newlinkTarget : spiderLinkFlags.oldlinkTarget;

            if (existing != null)
            {
                return false;
            }

            // No target yet: create it and put the link on the list of active links.
            targets.GetOrCreateTarget(sln, true, true);
            wRecord.web.webActiveLinks.Add(sln);

            return true;
        }