Exemplo n.º 1
0
 /// <summary>
 ///     Creates the page: stores <paramref name="connectionService" /> in <c>ConnectionService</c> and then calls <c>InitializeComponent</c>.
 /// </summary>
 /// <param name="connectionService">The <see cref="HtmlManager" /> assigned to <c>ConnectionService</c>.</param>
 public MainPage(HtmlManager connectionService)
 {
     // Assigned before InitializeComponent() runs; NOTE(review): presumably so the components can read it - confirm.
     ConnectionService = connectionService;
     InitializeComponent();
 }
Exemplo n.º 2
0
        /// <summary>
        ///     Looks up the magnet links JavBus lists for a title.
        ///     The detail page for <paramref name="avId" /> is fetched first; the <c>gid</c>, <c>uc</c> and <c>img</c> script variables found in it
        ///     are then used to request the magnet table, and every table row is converted into a <see cref="SeedMagnetSearchModel" />.
        /// </summary>
        /// <param name="avId">Identifier appended to <c>https://www.javbus.com/</c> to build the detail-page address.</param>
        /// <param name="cc">Optional cookie container forwarded to the detail-page request.</param>
        /// <returns>
        ///     The parsed rows. The list is empty (never <c>null</c>) when a request fails, when the page carries no <c>gid</c>,
        ///     or when no row could be parsed. Rows without a link, or without a parsable date, are skipped.
        /// </returns>
        public static List <SeedMagnetSearchModel> SearchJavBus(string avId, CookieContainer cc = null)
        {
            List <SeedMagnetSearchModel> ret = new List <SeedMagnetSearchModel>();

            var refere = "https://www.javbus.com/" + avId;

            var html = HtmlManager.GetHtmlContentViaUrl(refere, "utf-8", false, cc);

            if (!html.Success)
            {
                return(ret);
            }

            // The detail page exposes the values required by the magnet endpoint as inline script variables.
            var gidMatch = Regex.Match(html.Content, "var gid = (.*?);");
            var ucMatch  = Regex.Match(html.Content, "var uc = (.*?);");
            var picMatch = Regex.Match(html.Content, "var img = '(.*?)';");

            // Without a gid there is nothing to ask the magnet endpoint for, so skip the second request entirely.
            if (!gidMatch.Success)
            {
                return(ret);
            }

            var gid = gidMatch.Groups[1].Value;
            var uc  = ucMatch.Groups[1].Value;
            var pic = picMatch.Groups[1].Value;

            var url = $"https://www.javbus.com/ajax/uncledatoolsbyajax.php?gid={gid}&lang=zh&img={pic}&uc={uc}&floor=922";

            var magHtml = HtmlManager.GetHtmlWebClient(url, null, "javbus.com", "", refere);

            if (!magHtml.Success)
            {
                return(ret);
            }

            HtmlDocument htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(magHtml.Content);

            HtmlNodeCollection nodes = htmlDocument.DocumentNode.SelectNodes("//tr[@style=' border-top:#DDDDDD solid 1px']");

            if (nodes == null)
            {
                return(ret);
            }

            foreach (var node in nodes)
            {
                // The name, size and date cells are read at child indexes 1, 3 and 5, so six child nodes are required.
                // (The previous "Count >= 5" guard let index 5 go out of range and relied on the catch-all to hide it.)
                if (node == null || node.ChildNodes.Count < 6)
                {
                    continue;
                }

                try
                {
                    var nameCell = node.ChildNodes[1];
                    var linkNode = nameCell.ChildNodes.Count > 1 ? nameCell.ChildNodes[1] : null;
                    var href     = linkNode != null ? linkNode.Attributes["href"] : null;

                    // A row without a magnet link is of no use to callers.
                    if (href == null)
                    {
                        continue;
                    }

                    var namePart = nameCell.InnerText.Trim();
                    var sizePart = node.ChildNodes[3].InnerText.Trim();
                    var datePart = node.ChildNodes[5].InnerText.Trim();

                    double size = FileSize.GetByteFromStr(sizePart);

                    // Rows whose date cannot be parsed are skipped, as before, but without throwing.
                    DateTime date;
                    if (!DateTime.TryParse(datePart, out date))
                    {
                        continue;
                    }

                    ret.Add(new SeedMagnetSearchModel()
                    {
                        CompleteCount = 0,
                        Date          = date,
                        Size          = size,
                        MagUrl        = href.Value,
                        Source        = SearchSeedSiteEnum.JavBus,
                        Title         = namePart,
                        Url           = ""
                    });
                }
                catch (Exception ex)
                {
                    // Parsing stays best-effort: one malformed row must not abort the whole search, but leave a trace of it.
                    System.Diagnostics.Debug.WriteLine("SearchJavBus: skipped a row for " + avId + " => " + ex.Message);
                }
            }

            return(ret);
        }
Exemplo n.º 3
0
        /// <summary>
        ///     Requests <paramref name="url" /> through <c>HtmlManager.GetHtmlWebClientWithRenewCC</c>, passing the JavLibrary base address and the <c>cc</c> member.
        /// </summary>
        /// <param name="url">Second argument forwarded to <c>GetHtmlWebClientWithRenewCC</c>.</param>
        /// <returns>The response object returned by <c>GetHtmlWebClientWithRenewCC</c>.</returns>
        private static Utils.HtmlResponse JavCookieContanierHelper(string url)
        {
            const string javLibraryBase = "http://www.javlibrary.com/cn/";

            return HtmlManager.GetHtmlWebClientWithRenewCC(javLibraryBase, url, cc);
        }
Exemplo n.º 4
0
        /// <summary>
        ///     Initializes a new instance of the <see cref = "Crawler" /> class.
        ///     Creates the DAO, the managers, the cache and the <c>Engine</c>, makes sure the log/download directories exist,
        ///     loads the crawl actions and crawl rules, and starts the CrawlerPeerManager server when crawler peers are configured.
        /// </summary>
        /// <param name = "applicationSettings">Settings handed to the DAO; the field is then replaced by the DAO's own <c>ApplicationSettings</c> before the managers are created.</param>
        /// <param name = "webSettings">Web settings handed to the DAO and to every manager.</param>
        /// <param name = "crawlMode">Value assigned to <c>CrawlMode</c>.</param>
        /// <param name = "crawlerPeers">Assigned to <c>CrawlerPeers</c> and passed to the CrawlerPeerManager.</param>
        /// <param name = "databasePeers">Assigned to <c>DatabasePeers</c> and passed to the DatabasePeerManager.</param>
        /// <param name = "enableRenderers">Assigned to <c>AreRenderersEnabled</c> and passed to the <c>Engine</c>.</param>
        /// <exception cref = "InvalidConfigurationException">Re-created with severity <c>Error</c> after the original has been passed to <c>ProcessException</c>.</exception>
        /// <exception cref = "Exception">Any other failure is passed to <c>ProcessException</c> and rethrown wrapped, with the original as the inner exception.</exception>
        public Crawler(ApplicationSettings applicationSettings, WebSettings webSettings, CrawlMode crawlMode, List <CrawlerPeer> crawlerPeers, List <DatabasePeer> databasePeers, bool enableRenderers)
        {
            Guid = Guid.NewGuid();

            try
            {
                _applicationSettings = applicationSettings;
                _webSettings         = webSettings;

                //the DAO is created first; every manager below receives the ApplicationSettings instance exposed by the DAO...
                _arachnodeDAO        = (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), _applicationSettings.ConnectionString, _applicationSettings, _webSettings, true, true);
                _applicationSettings = _arachnodeDAO.ApplicationSettings;

                _consoleManager = new ConsoleManager <TArachnodeDAO>(_applicationSettings, _webSettings);

                _consoleManager.OutputString("arachnode.net " + Assembly.GetExecutingAssembly().GetName().Version, ConsoleColor.Green, ConsoleColor.Gray);

                _actionManager = new ActionManager <TArachnodeDAO>(_applicationSettings, _webSettings, _consoleManager);
                _ruleManager   = new RuleManager <TArachnodeDAO>(_applicationSettings, _webSettings, _consoleManager);

                _memoryManager = new MemoryManager <TArachnodeDAO>(_applicationSettings, _webSettings);
                _cacheManager  = new CacheManager <TArachnodeDAO>(_applicationSettings, _webSettings);

                //the CacheManager is created exactly once (above); a second, identical construction used to follow here and discarded the first instance...
                _cookieManager = new CookieManager();

                CrawlerPeers  = crawlerPeers;
                DatabasePeers = databasePeers;

                //the CrawlerPeerManager gets its own DAO instance rather than sharing _arachnodeDAO...
                _crawlerPeerManager  = new CrawlerPeerManager <TArachnodeDAO>(_applicationSettings, _webSettings, CrawlerPeers, (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), _applicationSettings.ConnectionString, _applicationSettings, _webSettings, true, true));
                _databasePeerManager = new DatabasePeerManager <TArachnodeDAO>(_applicationSettings, _webSettings, DatabasePeers);

                _cache = new Cache <TArachnodeDAO>(_applicationSettings, _webSettings, this, _actionManager, _cacheManager, _crawlerPeerManager, _memoryManager, _ruleManager);

                _dataTypeManager     = new DataTypeManager <TArachnodeDAO>(_applicationSettings, _webSettings);
                _discoveryManager    = new DiscoveryManager <TArachnodeDAO>(_applicationSettings, _webSettings, _cache, _actionManager, _cacheManager, _memoryManager, _ruleManager);
                _crawlRequestManager = new CrawlRequestManager <TArachnodeDAO>(_applicationSettings, _webSettings, _cache, _consoleManager, _discoveryManager);
                _encodingManager     = new EncodingManager <TArachnodeDAO>(_applicationSettings, _webSettings);
                _htmlManager         = new HtmlManager <TArachnodeDAO>(_applicationSettings, _webSettings, _discoveryManager);
                _politenessManager   = new PolitenessManager <TArachnodeDAO>(_applicationSettings, _webSettings, _cache);
                _proxyManager        = new ProxyManager <TArachnodeDAO>(_applicationSettings, _webSettings, _consoleManager);
                _reportingManager    = new ReportingManager <TArachnodeDAO>(_applicationSettings, _webSettings, _consoleManager);

                //create required directories...
                if (!Directory.Exists(_applicationSettings.ConsoleOutputLogsDirectory))
                {
                    Directory.CreateDirectory(_applicationSettings.ConsoleOutputLogsDirectory);
                }

                if (!Directory.Exists(_applicationSettings.DownloadedFilesDirectory))
                {
                    Directory.CreateDirectory(_applicationSettings.DownloadedFilesDirectory);
                }

                if (!Directory.Exists(_applicationSettings.DownloadedImagesDirectory))
                {
                    Directory.CreateDirectory(_applicationSettings.DownloadedImagesDirectory);
                }

                if (!Directory.Exists(_applicationSettings.DownloadedWebPagesDirectory))
                {
                    Directory.CreateDirectory(_applicationSettings.DownloadedWebPagesDirectory);
                }

                QueryProcessor = new QueryProcessor <TArachnodeDAO>();

                _consoleManager.OutputString("Crawler: Initializing Configuration/Database Connection.", ConsoleColor.White, ConsoleColor.Gray);

                LoadCrawlActions(_arachnodeDAO);
                LoadCrawlRules(_arachnodeDAO);

                AreRenderersEnabled = enableRenderers;

                //the Engine also gets a DAO instance of its own...
                Engine = new Engine <TArachnodeDAO>(_applicationSettings, _webSettings, this, _cache, _actionManager, _cacheManager, _consoleManager, _cookieManager, _crawlRequestManager, _dataTypeManager, _discoveryManager, _encodingManager, _htmlManager, _memoryManager, _politenessManager, _proxyManager, _reportingManager, _ruleManager, enableRenderers, (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), _applicationSettings.ConnectionString, _applicationSettings, _webSettings, true, true));

                CrawlMode = crawlMode;

                /**/

                //the peer server is only started when at least one crawler peer is configured...
                if (CrawlerPeerManager != null && CrawlerPeerManager.CrawlerPeers != null && CrawlerPeerManager.CrawlerPeers.Count != 0)
                {
                    ConsoleManager.OutputString("Crawler: Starting CrawlerPeerManager Server", ConsoleColor.White, ConsoleColor.Gray);

                    CrawlerPeerManager.StartServer(this, _arachnodeDAO);

                    _crawlerPeerManager.SendStatusMessageToCrawlerPeers(_arachnodeDAO);
                }

                /**/

                if (Debugger.IsAttached)
                {
                    _consoleManager.OutputString("Debugger: Attached - Expect Performance Degradation.", ConsoleColor.Yellow, ConsoleColor.Gray);
                }

                //update all core/components/managers with the updated ApplicationSettings...
#if DEMO
                Engine.CrawlRequestCompleted += Engine_CrawlRequestCompleted;

                _stopwatch.Start();
#endif
            }
            catch (InvalidConfigurationException invalidConfigurationException)
            {
                ProcessException(invalidConfigurationException);

                //rethrown as a new instance so that the severity is always Error...
                throw new InvalidConfigurationException(invalidConfigurationException.ApplicationSettings, invalidConfigurationException.WebSettings, invalidConfigurationException.Message, InvalidConfigurationExceptionSeverity.Error);
            }
            catch (Exception exception)
            {
                ProcessException(exception);

                //the wrapper is kept because callers may already inspect InnerException...
                throw new Exception(exception.Message, exception);
            }
        }
Exemplo n.º 5
0
        /// <summary>
        ///     Downloads a manga detail page from <c>http://www.hanhan.net</c> and extracts the cover, name, author, status,
        ///     last update, description and chapter list from it.
        /// </summary>
        /// <param name="sourceType">Not read by this method.</param>
        /// <param name="path">Path of the detail page, passed to <c>HtmlManager.GetHtmlWebClient</c> together with the site address.</param>
        /// <returns>
        ///     A view model whose <c>Chapters</c> list is never <c>null</c>. When the page cannot be fetched, or parsing throws,
        ///     <c>MsgCode</c> is set to <c>VMCode.Exception</c> and <c>Msg</c> describes the problem; fields parsed before the failure are kept.
        /// </returns>
        public static MangaDetailVM GetMangaDetailHanhan(MangaCategorySourceType sourceType, string path)
        {
            MangaDetailVM ret = new MangaDetailVM();

            ret.Chapters = new List <MangaChapter>();

            var htmlRet = HtmlManager.GetHtmlWebClient("http://www.hanhan.net", path);

            if (htmlRet.Success)
            {
                try
                {
                    HtmlDocument document = new HtmlDocument();
                    document.LoadHtml(htmlRet.Content);

                    var picPath     = "//img[@class='pic']";
                    var detailPath  = "//ul[@class='detail-list cf']/li";
                    var infoPath    = "//div[@id='intro-all']//p";
                    var chapterPath = "//ul[@id='chapter-list-4']//a";

                    var picNode  = document.DocumentNode.SelectSingleNode(picPath);
                    var infoNode = document.DocumentNode.SelectSingleNode(infoPath);

                    var chapterNodes = document.DocumentNode.SelectNodes(chapterPath);
                    var detailNodes  = document.DocumentNode.SelectNodes(detailPath);

                    // The cover image carries both the picture address and the manga name (in its alt text).
                    if (picNode != null)
                    {
                        ret.PicUrl    = picNode.Attributes["src"].Value.Trim();
                        ret.MangaName = picNode.Attributes["alt"].Value.Trim();
                    }

                    if (detailNodes != null && detailNodes.Count > 0)
                    {
                        foreach (var node in detailNodes)
                        {
                            if (node.Attributes.Count <= 0)
                            {
                                // List items without attributes are scanned for the author entry.
                                foreach (var subNode in node.ChildNodes)
                                {
                                    // Ordinal comparison: the marker is a fixed token, not culture-dependent text.
                                    if (subNode.InnerText.StartsWith("漫画作者:", StringComparison.Ordinal))
                                    {
                                        ret.Author = subNode.InnerText.Replace("漫画作者:", "");
                                    }
                                }
                            }
                            else
                            {
                                // List items with attributes are read as the status / last-update entry.
                                ret.MangaStatus = node.ChildNodes.FindFirst("a").InnerHtml;

                                ret.UpdateDate = DateTime.Parse(node.ChildNodes[1].ChildNodes[5].InnerText);
                                ret.UpdateInfo = "更新到:" + node.ChildNodes[1].ChildNodes[7].InnerText;
                            }
                        }
                    }

                    if (infoNode != null)
                    {
                        ret.Description = infoNode.InnerText.Trim();
                    }

                    if (chapterNodes != null)
                    {
                        foreach (var node in chapterNodes)
                        {
                            // NOTE(review): chapter links are prefixed with hanhande.net while the page is fetched from hanhan.net - confirm which host is intended.
                            ret.Chapters.Add(new MangaChapter
                            {
                                Url         = "http://www.hanhande.net" + node.Attributes["href"].Value.Trim(),
                                ChapterName = node.ChildNodes.FindFirst("span").InnerText.Trim()
                            });
                        }
                    }
                }
                catch (Exception e)
                {
                    ret.MsgCode = VMCode.Exception;
                    ret.Msg     = e.ToString();
                }
            }
            else
            {
                // The fetch failed, so it must not be reported as Success (the previous code did exactly that).
                // NOTE(review): VMCode.Exception is the only non-success member visible here - switch to a dedicated failure code if VMCode has one.
                ret.MsgCode = VMCode.Exception;
                ret.Msg     = "网页获取失败";
            }

            return(ret);
        }
Exemplo n.º 6
0
        /// <summary>
        ///     Processes every scan URL that is not yet flagged as downloaded: fetches the detail page (renewing the cookie when the
        ///     response reports it as expired), stores the generated AV record, flags the URL as scanned and downloads the cover image
        ///     unless the image file already exists. Progress is written to the console.
        /// </summary>
        private static void ScanEachAvSingleThread()
        {
            List <ScanURL> urls  = JavDataBaseManager.GetScanURL().Where(x => x.IsDownload == false).ToList();
            int            index = 0;

            foreach (var url in urls)
            {
                // Double-check against the database before downloading anything.
                if (JavDataBaseManager.HasAv(url.URL))
                {
                    JavDataBaseManager.UpdateScanURL(url.URL);
                    Console.WriteLine("详情页已下载 => " + url.URL + " 完成" + index + " / " + urls.Count);
                    continue;
                }

                var htmlRes = new Utils.HtmlResponse();

                // At most five attempts; the cookie is renewed after every attempt that comes back expired.
                for (int retry = 1; retry <= 5; retry++)
                {
                    htmlRes = HtmlManager.GetHtmlWebClientWithRenewCC("http://www.javlibrary.com/cn/", url.URL, cc);

                    if (!htmlRes.IsExpire)
                    {
                        break;
                    }

                    GetJavCookie();
                }

                if (!htmlRes.Success)
                {
                    continue;
                }

                index++;

                // The page content is handed to GenerateAVModel as a string; the HtmlDocument that used to be built here was never read.
                var av = GenerateAVModel(htmlRes.Content, url.URL);

                JavDataBaseManager.InsertAV(av);
                Console.WriteLine("线程 " + Thread.CurrentThread.ManagedThreadId.ToString() + " => 插入AV => " + av.ID + " - " + av.Name);
                JavDataBaseManager.UpdateScanURL(url.URL);

                // Built once and used for both the existence check and the download target.
                string imagePath = ImgFolder + av.ID + av.Name + ".jpg";

                if (!File.Exists(imagePath))
                {
                    string result = DownloadHelper.DownloadHttps(av.PictureURL, imagePath, "");

                    // An empty result is treated as success; anything else is printed as returned.
                    if (string.IsNullOrEmpty(result))
                    {
                        Console.WriteLine("线程 " + Thread.CurrentThread.ManagedThreadId.ToString() + " => 下载AV图片成功 => " + av.ID + " - " + av.Name);
                    }
                    else
                    {
                        Console.WriteLine(result);
                    }
                }
                else
                {
                    Console.WriteLine("已存在图片不下载");
                }

                Console.WriteLine("完成" + index + " / " + urls.Count);
            }
        }
 /// <summary>
 ///     Calls <c>HtmlManager.Clean</c> and then returns what <c>HtmlManager.createHtml</c> produces for <c>fileName</c>.
 /// </summary>
 /// <returns>The value returned by <c>HtmlManager.createHtml(fileName)</c>.</returns>
 public static string getHTML()
 {
     HtmlManager.Clean();

     string html = HtmlManager.createHtml(fileName);

     return html;
 }
Exemplo n.º 8
0
        /// <summary>
        ///     Manual check of the JavLibrary search-by-id page: obtains a cookie container through <c>InitManager.UpdateCookie</c>
        ///     and requests the search address for <paramref name="id" />. The response is not inspected here.
        /// </summary>
        /// <param name="id">Keyword appended to the search address.</param>
        private static void TestJavLibrarySearch(string id)
        {
            var cookies   = InitManager.UpdateCookie(null, "http://www.javlibrary.com/cn").CC;
            var searchUrl = "http://www.javlibrary.com/cn/vl_searchbyid.php?keyword=" + id;

            // Kept in a local so the response can be inspected while debugging.
            var response = HtmlManager.GetHtmlContentViaUrl(searchUrl, "utf-8", true, cookies);
        }
Exemplo n.º 9
0
        /// <summary>
        ///     Process a range of WebPageID after crawling.  Useful if crawled WebPages were not processed at crawl time according to desired ApplicationSettings configuration.
        ///     Calling this method DOES change the 'LastDiscovered' fields where applicable.
        ///     This method is intended for post-processing, not for use while crawling.
        /// </summary>
        /// <param name = "crawler">Supplies the ApplicationSettings and WebSettings used to build the DAO and the managers, and is passed on to them.</param>
        /// <param name = "webPageIDLowerBound">First WebPageID to process (inclusive).</param>
        /// <param name = "webPageIDUpperBound">Last WebPageID to process (inclusive).</param>
        public static void ProcessWebPages(Crawler <TArachnodeDAO> crawler, long webPageIDLowerBound, long webPageIDUpperBound)
        {
            //do not assign the application settings.  doing so will override the ApplicationSetting you set before calling this method...
            TArachnodeDAO arachnodeDAO = (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), crawler.ApplicationSettings.ConnectionString, crawler.ApplicationSettings, crawler.WebSettings, false, false);

            ConsoleManager <TArachnodeDAO> consoleManager = new ConsoleManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
            ActionManager <TArachnodeDAO>  actionManager  = new ActionManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager);
            CookieManager cookieManager = new CookieManager();
            MemoryManager <TArachnodeDAO>      memoryManager      = new MemoryManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
            RuleManager <TArachnodeDAO>        ruleManager        = new RuleManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager);
            CacheManager <TArachnodeDAO>       cacheManager       = new CacheManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
            CrawlerPeerManager <TArachnodeDAO> crawlerPeerManager = new CrawlerPeerManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, null, arachnodeDAO);
            Cache <TArachnodeDAO>            cache            = new Cache <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, crawler, actionManager, cacheManager, crawlerPeerManager, memoryManager, ruleManager);
            DiscoveryManager <TArachnodeDAO> discoveryManager = new DiscoveryManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, cache, actionManager, cacheManager, memoryManager, ruleManager);
            HtmlManager <TArachnodeDAO>      htmlManager      = new HtmlManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, discoveryManager);

            //load the CrawlActions, CrawlRules and EngineActions...
            ruleManager.ProcessCrawlRules(crawler);
            actionManager.ProcessCrawlActions(crawler);
            actionManager.ProcessEngineActions(crawler);

            //these three methods are called in the Engine.
            UserDefinedFunctions.RefreshAllowedExtensions(true);
            UserDefinedFunctions.RefreshAllowedSchemes(true);
            UserDefinedFunctions.RefreshDisallowed();

            //instantiate a WebClient to access the ResponseHeaders...
            WebClient <TArachnodeDAO> webClient = new WebClient <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager, cookieManager, new ProxyManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager));

            //NOTE(review): presumably issued so that webClient.HttpWebResponse is populated before ProcessWebPage clears and refills its Headers - confirm.
            webClient.GetHttpWebResponse("http://google.com", "GET", null, null, null, null);

            WebPageManager <TArachnodeDAO> webPageManager = new WebPageManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, discoveryManager, htmlManager, arachnodeDAO);

            for (long i = webPageIDLowerBound; i <= webPageIDUpperBound; i++)
            {
                ArachnodeDataSet.WebPagesRow webPagesRow = null;

                try
                {
                    //get the WebPage from the database.  we need the source data as we don't store this in the index.
                    //even though most of the fields are available in the Document, the WebPage is the authoritative source, so we'll use that for all of the fields.
                    webPagesRow = arachnodeDAO.GetWebPage(i.ToString());

                    if (webPagesRow != null)
                    {
                        if (webPagesRow.Source == null || webPagesRow.Source.Length == 0)
                        {
                            //the source was not stored with the row; fall back to the copy saved on disk.  the path is resolved once and reused...
                            string discoveryPath = discoveryManager.GetDiscoveryPath(crawler.ApplicationSettings.DownloadedWebPagesDirectory, webPagesRow.AbsoluteUri, webPagesRow.FullTextIndexType);

                            if (File.Exists(discoveryPath))
                            {
                                using (StreamReader streamReader = File.OpenText(discoveryPath))
                                {
                                    webPagesRow.Source = Encoding.UTF8.GetBytes(streamReader.ReadToEnd());
                                }
                            }
                            else
                            {
                                Console.WriteLine("WebPageID: " + i + " was NOT processed successfully.");
                                if (OnWebPageProcessed != null)
                                {
                                    OnWebPageProcessed.BeginInvoke(webPagesRow, "WebPageID: " + i + " was NOT processed successfully.", null, null);
                                }

                                //NOTE(review): execution still falls through to ProcessWebPage below, so a page can be reported as both failed and successful - confirm whether this is intended.
                            }
                        }

                        ProcessWebPage(crawler.ApplicationSettings, crawler.WebSettings, crawler, webPagesRow, webClient, cache, actionManager, consoleManager, crawlerPeerManager, discoveryManager, memoryManager, ruleManager, webPageManager, arachnodeDAO);

                        Console.WriteLine("WebPageID: " + i + " was processed successfully.");
                        if (OnWebPageProcessed != null)
                        {
                            OnWebPageProcessed.BeginInvoke(webPagesRow, "WebPageID: " + i + " was processed successfully.", null, null);
                        }
                    }
                }
                catch (Exception exception)
                {
                    //a failure on one page is reported and recorded, and the loop moves on to the next ID...
                    Console.WriteLine("WebPageID: " + i + " was NOT processed successfully.");
                    Console.WriteLine(exception.Message);

                    if (OnWebPageProcessed != null)
                    {
                        OnWebPageProcessed.BeginInvoke(webPagesRow, "WebPageID: " + i + " was NOT processed successfully.", null, null);
                        OnWebPageProcessed.BeginInvoke(webPagesRow, exception.Message, null, null);
                    }

                    arachnodeDAO.InsertException(null, null, exception, false);
                }
            }

            //stop the CrawlActions, CrawlRules and EngineActions...
            ruleManager.Stop();
            actionManager.Stop();
        }
Exemplo n.º 10
0
 /// <summary>
 ///     Base constructor of the WebPageManager: forwards the settings to the base class and keeps the collaborators in fields.
 /// </summary>
 /// <param name = "applicationSettings">Forwarded to the base constructor.</param>
 /// <param name = "webSettings">Forwarded to the base constructor.</param>
 /// <param name = "discoveryManager">Stored in <c>_discoveryManager</c>.</param>
 /// <param name = "htmlManager">Stored in <c>_htmlManager</c>.</param>
 /// <param name = "arachnodeDAO">Must be thread-safe.</param>
 protected AWebPageManager(
     ApplicationSettings applicationSettings,
     WebSettings webSettings,
     DiscoveryManager <TArachnodeDAO> discoveryManager,
     HtmlManager <TArachnodeDAO> htmlManager,
     IArachnodeDAO arachnodeDAO)
     : base(applicationSettings, webSettings)
 {
     _arachnodeDAO     = arachnodeDAO;
     _htmlManager      = htmlManager;
     _discoveryManager = discoveryManager;
 }
Exemplo n.º 11
0
        /// <summary>
        ///     Processes a WebPagesRow after crawling.
        ///     A CrawlRequest is rebuilt from the row (uri, depth, source bytes, code page and stored response headers) and then run through
        ///     data type detection, encoding processing, the optional WebPage insert, WebPage management, email address/hyperlink processing
        ///     and the PostRequest crawl actions.
        /// </summary>
        /// <param name = "applicationSettings">Used to construct the managers created here and read for the Insert*/Extract*/Save* switches.</param>
        /// <param name = "webSettings">Used to construct the managers created here.</param>
        /// <param name = "crawler">Passed to the Crawl created for the request.</param>
        /// <param name = "webPagesRow">The web pages row.</param>
        /// <param name="webClient">Assigned to the CrawlRequest; the Headers of its HttpWebResponse are cleared and refilled from the row.</param>
        /// <param name="cache">Passed to the CrawlRequestManager and PolitenessManager created here.</param>
        /// <param name="actionManager">Passed to the Crawl and used to perform the PostRequest crawl actions.</param>
        /// <param name="consoleManager">Passed to the CrawlRequestManager, ProxyManager and Crawl created here.</param>
        /// <param name="crawlerPeerManager">Not referenced in this method body.</param>
        /// <param name="discoveryManager">Passed to the managers created here and used to close and dispose the managed discovery.</param>
        /// <param name="memoryManager">Not referenced in this method body.</param>
        /// <param name="ruleManager">Passed to the Crawl created here.</param>
        /// <param name = "webPageManager">The web page manager.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO.</param>
        public static void ProcessWebPage(ApplicationSettings applicationSettings, WebSettings webSettings, Crawler <TArachnodeDAO> crawler, ArachnodeDataSet.WebPagesRow webPagesRow, WebClient <TArachnodeDAO> webClient, Cache <TArachnodeDAO> cache, ActionManager <TArachnodeDAO> actionManager, ConsoleManager <TArachnodeDAO> consoleManager, CrawlerPeerManager <TArachnodeDAO> crawlerPeerManager, DiscoveryManager <TArachnodeDAO> discoveryManager, MemoryManager <TArachnodeDAO> memoryManager, RuleManager <TArachnodeDAO> ruleManager, WebPageManager <TArachnodeDAO> webPageManager, IArachnodeDAO arachnodeDAO)
        {
            //NOTE(review): cacheManager is constructed but not referenced again in this method - confirm whether the construction is needed for a side effect.
            CacheManager <TArachnodeDAO> cacheManager = new CacheManager <TArachnodeDAO>(applicationSettings, webSettings);
            CookieManager cookieManager = new CookieManager();
            CrawlRequestManager <TArachnodeDAO> crawlRequestManager = new CrawlRequestManager <TArachnodeDAO>(applicationSettings, webSettings, cache, consoleManager, discoveryManager);
            DataTypeManager <TArachnodeDAO>     dataTypeManager     = new DataTypeManager <TArachnodeDAO>(applicationSettings, webSettings);
            EncodingManager <TArachnodeDAO>     encodingManager     = new EncodingManager <TArachnodeDAO>(applicationSettings, webSettings);
            PolitenessManager <TArachnodeDAO>   politenessManager   = new PolitenessManager <TArachnodeDAO>(applicationSettings, webSettings, cache);
            ProxyManager <TArachnodeDAO>        proxyManager        = new ProxyManager <TArachnodeDAO>(applicationSettings, webSettings, consoleManager);
            HtmlManager <TArachnodeDAO>         htmlManager         = new HtmlManager <TArachnodeDAO>(applicationSettings, webSettings, discoveryManager);
            Crawl <TArachnodeDAO> crawl = new Crawl <TArachnodeDAO>(applicationSettings, webSettings, crawler, actionManager, consoleManager, cookieManager, crawlRequestManager, dataTypeManager, discoveryManager, encodingManager, htmlManager, politenessManager, proxyManager, ruleManager, true);

            //create a CrawlRequest as this is what the internals of SiteCrawler.dll expect to operate on...
            CrawlRequest <TArachnodeDAO> crawlRequest = new CrawlRequest <TArachnodeDAO>(new Discovery <TArachnodeDAO>(webPagesRow.AbsoluteUri), webPagesRow.CrawlDepth, UriClassificationType.Host, UriClassificationType.Host, 0, RenderType.None, RenderType.None);

            //populate the CrawlRequest from the stored row instead of from a live download...
            crawlRequest.Crawl = crawl;
            crawlRequest.Discovery.DiscoveryType = DiscoveryType.WebPage;
            crawlRequest.Discovery.ID            = webPagesRow.ID;
            crawlRequest.Data         = webPagesRow.Source;
            crawlRequest.CurrentDepth = webPagesRow.CrawlDepth;
            crawlRequest.Encoding     = Encoding.GetEncoding(webPagesRow.CodePage);
            crawlRequest.ProcessData  = true;
            crawlRequest.WebClient    = webClient;

            //discard whatever headers the supplied WebClient currently holds; they are replaced by the row's stored headers below...
            crawlRequest.WebClient.HttpWebResponse.Headers.Clear();

            //parse the ResponseHeaders from the WebPagesRow.ResponseHeaders string...
            foreach (string responseHeader in webPagesRow.ResponseHeaders.Split("\r\n".ToCharArray(), StringSplitOptions.RemoveEmptyEntries))
            {
                string[] responseHeaderSplit = responseHeader.Split(":".ToCharArray());

                //only the name is taken from the split; the value is looked up in the full header string by name...
                string name  = responseHeaderSplit[0];
                string value = UserDefinedFunctions.ExtractResponseHeader(webPagesRow.ResponseHeaders, name, true).Value;

                crawlRequest.WebClient.HttpWebResponse.Headers.Add(name, value);
            }

            //refresh the DataTypes in the DataTypeManager... (if necessary)...
            if (dataTypeManager.AllowedDataTypes.Count == 0)
            {
                dataTypeManager.RefreshDataTypes();
            }

            crawlRequest.DataType = dataTypeManager.DetermineDataType(crawlRequest);

            //now, process the bytes...
            encodingManager.ProcessCrawlRequest(crawlRequest, arachnodeDAO);

            //when inserts are enabled the Discovery.ID is replaced by the ID returned from the insert; an empty byte array is stored when InsertWebPageSource is off...
            if (applicationSettings.InsertWebPages)
            {
                crawlRequest.Discovery.ID = arachnodeDAO.InsertWebPage(crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.WebClient.HttpWebResponse.Headers.ToString(), applicationSettings.InsertWebPageSource ? crawlRequest.Data : new byte[] { }, crawlRequest.Encoding.CodePage, crawlRequest.DataType.FullTextIndexType, crawlRequest.CurrentDepth, applicationSettings.ClassifyAbsoluteUris);
            }

            crawlRequest.ManagedDiscovery = webPageManager.ManageWebPage(crawlRequest.Discovery.ID.Value, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.Data, crawlRequest.Encoding, crawlRequest.DataType.FullTextIndexType, applicationSettings.ExtractWebPageMetaData, applicationSettings.InsertWebPageMetaData, applicationSettings.SaveDiscoveredWebPagesToDisk);

            //assigning FileAndImageDiscoveries isn't applicable because Files and Images need to be crawled to be properly classified... without classification we don't know whether they belong in dbo.Files or dbo.Images...
            crawlRequestManager.ProcessEmailAddresses(crawlRequest, arachnodeDAO);
            crawlRequestManager.ProcessHyperLinks(crawlRequest, arachnodeDAO);

            actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PostRequest, arachnodeDAO);

            discoveryManager.CloseAndDisposeManagedDiscovery(crawlRequest, arachnodeDAO);
        }
Exemplo n.º 12
0
        /// <summary>
        ///     Handles the Load event of the Page control: when a <c>Document</c> is set, binds its stored fields
        ///     to the title link, the summary or image, the cached/browse/explain links and the score label.
        /// </summary>
        /// <param name = "sender">The source of the event.</param>
        /// <param name = "e">The <see cref = "System.EventArgs" /> instance containing the event data.</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            if (Document == null)
            {
                return;
            }

            string absoluteUri   = Document.GetField("absoluteuri").StringValue();
            string pageTitle     = Document.GetField("title").StringValue();
            string discoveryType = Document.GetField("discoverytype").StringValue();
            string discoveryID   = Document.GetField("discoveryid").StringValue();

            uxHlTitle.NavigateUrl = absoluteUri;

            // Title preference: stored page title, then the file name extracted from the URI, then the raw URI.
            if (!string.IsNullOrEmpty(pageTitle))
            {
                uxHlTitle.Text = TruncateTitle(pageTitle);
            }
            else
            {
                var fileName = UserDefinedFunctions.ExtractFileName(absoluteUri);

                // The raw URI fallback is intentionally shown untruncated.
                uxHlTitle.Text = fileName.IsNull ? absoluteUri : TruncateTitle(fileName.Value);
            }

            string imageUrl = null;

            if (discoveryType != "image")
            {
                uxLblSummary.Text  = Summary;
                uxImgImage.Visible = false;
            }
            else
            {
                uxLblSummary.Visible = false;

                // Resolved once here and reused for the cached link in the switch below.
                imageUrl            = HtmlManager.GetImageUrl(absoluteUri, Document.GetField("fulltextindextype").StringValue(), ArachnodeDAO);
                uxImgImage.ImageUrl = imageUrl;
            }

            uxLblAbsoluteUri.Text = absoluteUri;

            switch (discoveryType)
            {
            case "file":
                uxHlCached.NavigateUrl = HtmlManager.GetFileUrl(absoluteUri, Document.GetField("fulltextindextype").StringValue(), ArachnodeDAO);
                break;

            case "image":
                uxHlCached.NavigateUrl = imageUrl;
                break;

            case "webpage":
                string encodedUri = HttpUtility.UrlEncode(absoluteUri);

                uxHlBrowse.Visible     = true;
                uxHlBrowse.NavigateUrl = "/Browse.aspx?discoveryID=" + discoveryID + "&absoluteUri=" + encodedUri;
                // NOTE(review): the encrypted path, code page and index type are appended unencoded, as before — confirm Cached.aspx expects that.
                uxHlCached.NavigateUrl = "/Cached.aspx?discoveryID=" + discoveryID + "&absoluteUri=" + encodedUri + "&webPage=" + Encryption.EncryptRijndaelManaged(Document.GetField("discoverypath").StringValue()) + "&codePage=" + Document.GetField("codepage").StringValue() + "&fullTextIndexType=" + Document.GetField("fulltextindextype").StringValue();
                break;
            }

            string documentID   = Document.GetField("documentid").StringValue();
            string strengthText = Document.GetField("strength").StringValue();

            // The user's query and the URI are URL-encoded so that '&', '#', '+' etc. inside them
            // cannot break or inject parameters into the Explanation link.
            uxHlExplain.NavigateUrl = "/Explanation.aspx?query=" + HttpUtility.UrlEncode(Request.QueryString["query"]) + "&absoluteUri=" + HttpUtility.UrlEncode(absoluteUri) + "&documentID=" + documentID + "&strength=" + strengthText;

            // Unparseable values leave score/strength at 0.
            double score;
            double.TryParse(Document.GetField("relevancyscore").StringValue(), out score);

            double strength;
            double.TryParse(strengthText, out strength);

            uxLblScoreAndStrength.Text = "Score:" + Math.Round(score, 2) + " Strength:" + Math.Round(strength, 2) + " = Total:" + (score * strength);
        }

        /// <summary>
        ///     Cuts <paramref name = "title" /> down to <c>WebSettings.MaximumPageTitleLength</c> characters,
        ///     appending "..." when it was shortened; shorter titles are returned unchanged.
        /// </summary>
        /// <param name = "title">The non-null title text to shorten.</param>
        /// <returns>The title, possibly truncated and suffixed with "...".</returns>
        private string TruncateTitle(string title)
        {
            return title.Length > WebSettings.MaximumPageTitleLength ? title.Substring(0, WebSettings.MaximumPageTitleLength) + "..." : title;
        }
Exemplo n.º 13
0
 /// <summary>
 ///     Creates the login page: keeps the supplied services in private fields,
 ///     then runs <c>InitializeComponent</c>.
 /// </summary>
 /// <param name="authService">Service stored in <c>_authService</c>.</param>
 /// <param name="htmlService">Service stored in <c>_htmlService</c>.</param>
 public LoginPage(AuthService authService, HtmlManager htmlService)
 {
     // Fields are assigned before the component is initialized, as in the original ordering.
     this._authService = authService;
     this._htmlService = htmlService;

     InitializeComponent();
 }
Exemplo n.º 14
0
        /// <summary>
        ///     Searches btsow.space for <paramref name="id"/>, scrapes each result row (title, size, date, detail URL)
        ///     and then loads every detail page to fill in the magnet link.
        /// </summary>
        /// <param name="id">Search term appended to the btsow search URL.</param>
        /// <returns>
        ///     The parsed results ordered by size, largest first. The search is best-effort: on any failure the
        ///     results collected so far (possibly none) are returned and the error is written to the trace output.
        /// </returns>
        public static List <SeedMagnetSearchModel> SearchBtsow(string id)
        {
            List <SeedMagnetSearchModel> ret = new List <SeedMagnetSearchModel>();

            try
            {
                var serachContent = "https://btsow.space/search/" + id;
                var htmlRet       = HtmlManager.GetHtmlWebClient("https://btsow.space", serachContent, null, true);

                if (htmlRet.Success)
                {
                    HtmlDocument htmlDocument = new HtmlDocument();
                    htmlDocument.LoadHtml(htmlRet.Content);

                    HtmlNodeCollection nodes = htmlDocument.DocumentNode.SelectNodes("//div[@class='row']");

                    // SelectNodes yields null (not an empty collection) when nothing matches.
                    if (nodes != null)
                    {
                        // The last matching row is skipped, as before — presumably it is not a result row; TODO confirm.
                        foreach (var node in nodes.Take(nodes.Count - 1))
                        {
                            // A row that does not have the expected shape is skipped instead of aborting the whole search.
                            if (node.ChildNodes.Count < 6 || node.ChildNodes[1].ChildNodes.Count < 2)
                            {
                                continue;
                            }

                            var linkNode = node.ChildNodes[1];
                            var text     = linkNode.ChildNodes[1].InnerText.Trim();
                            var size     = FileUtility.GetFileSizeFromString(node.ChildNodes[3].InnerText.Trim());

                            DateTime date;
                            if (!DateTime.TryParse(node.ChildNodes[5].InnerText.Trim(), out date))
                            {
                                continue;
                            }

                            // The detail URL is the first double-quoted value in the link's markup.
                            var a          = linkNode.OuterHtml;
                            var openQuote  = a.IndexOf('"');
                            var closeQuote = openQuote < 0 ? -1 : a.IndexOf('"', openQuote + 1);
                            if (closeQuote < 0)
                            {
                                continue;
                            }

                            ret.Add(new SeedMagnetSearchModel
                            {
                                Title  = text,
                                Size   = size,
                                Date   = date,
                                Url    = a.Substring(openQuote + 1, closeQuote - openQuote - 1),
                                Source = SearchSeedSiteEnum.Btsow
                            });
                        }
                    }

                    // Second pass: fetch each detail page and pick up the magnet link from its textarea.
                    foreach (var r in ret)
                    {
                        var subHtmlRet = HtmlManager.GetHtmlContentViaUrl(r.Url);

                        if (subHtmlRet.Success)
                        {
                            var detailDocument = new HtmlDocument();
                            detailDocument.LoadHtml(subHtmlRet.Content);

                            HtmlNode magnetNode = detailDocument.DocumentNode.SelectSingleNode("//textarea[@class='magnet-link hidden-xs']");

                            if (magnetNode != null)
                            {
                                r.MagUrl = magnetNode.InnerText;
                            }
                        }
                    }
                }
            }
            catch (Exception ee)
            {
                // Still best-effort, but no longer silent.
                System.Diagnostics.Trace.TraceError("SearchBtsow failed for '" + id + "': " + ee);
            }

            return(ret.OrderByDescending(x => x.Size).ToList());
        }
Exemplo n.º 15
0
        /// <summary>
        ///     Submits one or more magnet links to the 115.com offline-download endpoint using the stored 115 cookies.
        /// </summary>
        /// <param name="mag">
        ///     A single magnet link, or several concatenated links; the text is split on the "magnet:?" prefix.
        /// </param>
        /// <returns>
        ///     JSON <c>{ status, msg }</c>: <c>status</c> is the parsed "state" of the 115 response (false when there is
        ///     no usable response) and <c>msg</c> is the service's "error_msg" or a generic success/failure text.
        /// </returns>
        public JsonResult Add115Task(string mag)
        {
            if (string.IsNullOrWhiteSpace(mag))
            {
                return(Json(new { status = false, msg = "磁力链接为空" }, JsonRequestBehavior.AllowGet));
            }

            CookieContainer cc  = new CookieContainer();
            bool            ret = false;
            string          msg = "";

            foreach (var t in JsonConvert.DeserializeObject <List <CookieItem> >(ScanDataBaseManager.GetOneOneFiveCookie().OneOneFiveCookie))
            {
                cc.Add(new Cookie(t.Name, t.Value, "/", "115.com"));
            }

            // Materialized once: the original re-ran the deferred split/filter on every Count() and foreach.
            var links    = mag.Split(new string[] { "magnet:?" }, StringSplitOptions.None).Where(x => !string.IsNullOrEmpty(x)).ToList();
            var isSingle = links.Count <= 1;

            Dictionary <string, string> param = new Dictionary <string, string>();

            if (isSingle)
            {
                param.Add("url", mag);
            }
            else
            {
                // Split removed the prefix, so it is put back on every link.
                for (int index = 0; index < links.Count; index++)
                {
                    param.Add(string.Format("url[{0}]", index), "magnet:?" + links[index]);
                }
            }

            param.Add("sign", "");
            param.Add("uid", "340200422");
            param.Add("time", DateTime.Now.ToFileTimeUtc() + "");

            var endpoint  = isSingle ? "https://115.com/web/lixian/?ct=lixian&ac=add_task_url" : "https://115.com/web/lixian/?ct=lixian&ac=add_task_urls";
            var returnStr = HtmlManager.Post(endpoint, param, cc);

            if (!string.IsNullOrEmpty(returnStr))
            {
                try
                {
                    var data  = Newtonsoft.Json.Linq.JObject.Parse(returnStr);
                    var state = data.Property("state");

                    if (state != null)
                    {
                        bool.TryParse(state.Value.ToString(), out ret);
                    }

                    if (!ret)
                    {
                        var error = data.Property("error_msg");

                        if (error != null)
                        {
                            msg = error.Value.ToString();
                        }
                    }
                }
                catch (Newtonsoft.Json.JsonException)
                {
                    // A non-JSON body (e.g. an HTML page) counts as a failed submission.
                    ret = false;
                }
            }

            if (string.IsNullOrEmpty(msg))
            {
                // Previously the success text was used even when the request had failed without an error message.
                msg = ret ? "下载成功" : "下载失败";
            }

            return(Json(new { status = ret, msg = msg }, JsonRequestBehavior.AllowGet));
        }
Exemplo n.º 16
0
        /// <summary>
        ///     Searches sukebei.nyaa.si for <paramref name="id"/> and parses the result table into magnet search models.
        /// </summary>
        /// <param name="id">Search term; it is URL-escaped before being placed in the query string.</param>
        /// <param name="cc">Optional cookies forwarded to the HTTP request.</param>
        /// <returns>
        ///     Results with a non-negative size, ordered by <c>CompleteCount</c> and then size (both descending).
        ///     The search is best-effort: on failure the rows parsed so far are returned and the error is traced.
        /// </returns>
        public static List <SeedMagnetSearchModel> SearchSukebei(string id, CookieContainer cc = null)
        {
            List <SeedMagnetSearchModel> ret = new List <SeedMagnetSearchModel>();

            try
            {
                var serachContent = "https://sukebei.nyaa.si?f=0&c=0_0&q=" + Uri.EscapeDataString(id ?? "");
                var htmlRet       = HtmlManager.GetHtmlWebClient("https://sukebei.nyaa.si", serachContent, cc);

                if (htmlRet.Success)
                {
                    HtmlDocument htmlDocument = new HtmlDocument();
                    htmlDocument.LoadHtml(htmlRet.Content);

                    HtmlNodeCollection nodes = htmlDocument.DocumentNode.SelectNodes("//tr");

                    // SelectNodes yields null (not an empty collection) when nothing matches.
                    if (nodes != null)
                    {
                        const string anchorPrefix = "<a href=\"";
                        const string magnetMarker = "<a href=\"magnet:?xt";

                        var unixEpoch = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc);

                        // The first row is skipped, as before — presumably the table header; TODO confirm.
                        foreach (var node in nodes.Skip(1))
                        {
                            // Cells are read at child indexes 3, 5, 7 and 9; skip rows that are too short.
                            if (node.ChildNodes.Count < 10)
                            {
                                continue;
                            }

                            var text = FileUtility.ReplaceInvalidChar(node.ChildNodes[3].InnerText.Trim());
                            var a    = node.ChildNodes[5].OuterHtml;
                            var size = node.ChildNodes[7].InnerText.Trim();

                            // Rows without a magnet anchor are skipped rather than yielding a garbage link.
                            var markerIndex = a.IndexOf(magnetMarker, StringComparison.Ordinal);
                            if (markerIndex < 0)
                            {
                                continue;
                            }

                            var url        = a.Substring(markerIndex + anchorPrefix.Length);
                            var closeQuote = url.IndexOf('"');
                            if (closeQuote < 0)
                            {
                                continue;
                            }

                            url = url.Substring(0, closeQuote);

                            // Read the timestamp from the attribute itself instead of string-replacing the cell's
                            // markup, which only worked when the cell had no inner text. Unparseable values give 0.
                            long seconds;
                            long.TryParse(node.ChildNodes[9].GetAttributeValue("data-timestamp", ""), out seconds);

                            // Add to a UTC epoch first, then convert, so the offset (incl. DST) of the actual date applies.
                            DateTime dt = unixEpoch.AddSeconds(seconds).ToLocalTime();

                            ret.Add(new SeedMagnetSearchModel
                            {
                                Title  = text,
                                Size   = FileUtility.GetFileSizeFromString(size),
                                Date   = dt,
                                Url    = "",
                                MagUrl = url,
                                Source = SearchSeedSiteEnum.Sukebei
                            });
                        }
                    }
                }
            }
            catch (Exception ee)
            {
                // Still best-effort, but no longer silent.
                System.Diagnostics.Trace.TraceError("SearchSukebei failed for '" + id + "': " + ee);
            }

            return(ret.Where(x => x.Size >= 0).OrderByDescending(x => x.CompleteCount).ThenByDescending(x => x.Size).ToList());
        }
Exemplo n.º 17
0
        /// <summary>
        ///     Downloads every category list page in <paramref name="urls"/> sequentially, extracts the videos listed
        ///     on it and inserts the ones not yet recorded via <c>JavDataBaseManager.InsertScanURL</c>.
        /// </summary>
        /// <param name="urls">List-page URL (key) mapped to the category name stored with each video (value).</param>
        private static void ScanCategoryPageUrlSingleThread(Dictionary <string, string> urls)
        {
            const int maxAttempts = 5;

            int index = 0;

            foreach (var url in urls)
            {
                // Every page is counted, including failed ones, so the progress output matches the position in urls.
                index++;

                var htmlRes = new Utils.HtmlResponse();

                // If the cookie has expired, renew it and retry, at most 5 attempts.
                for (int attempt = 1; attempt <= maxAttempts; attempt++)
                {
                    htmlRes = HtmlManager.GetHtmlWebClientWithRenewCC("http://www.javlibrary.com/cn/", url.Key, cc);

                    if (!htmlRes.IsExpire)
                    {
                        break;
                    }

                    GetJavCookie();
                }

                if (!htmlRes.Success)
                {
                    Console.WriteLine("获取列表页 " + url.Key + " 内容失败");
                    continue;
                }

                HtmlAgilityPack.HtmlDocument htmlDocument = new HtmlAgilityPack.HtmlDocument();
                htmlDocument.LoadHtml(htmlRes.Content);

                var videoNodes = htmlDocument.DocumentNode.SelectNodes("//div[@class='video']");

                if (videoNodes == null)
                {
                    continue;
                }

                int unScanCount = 0;

                foreach (var node in videoNodes)
                {
                    // Indexing an empty child collection throws, and nothing up the stack catches it,
                    // so malformed entries are skipped instead of crashing the whole scan.
                    if (node.ChildNodes.Count == 0)
                    {
                        continue;
                    }

                    var urlAndTitle = node.ChildNodes[0];

                    if (urlAndTitle.ChildNodes.Count < 3)
                    {
                        continue;
                    }

                    var hrefAttribute = urlAndTitle.Attributes["href"];

                    if (hrefAttribute == null)
                    {
                        continue;
                    }

                    var id    = urlAndTitle.ChildNodes[0].InnerText.Trim();
                    var name  = FileUtility.ReplaceInvalidChar(urlAndTitle.ChildNodes[2].InnerText.Trim());
                    var avUrl = hrefAttribute.Value.Trim().Replace("./", "http://www.javlibrary.com/cn/");

                    if (string.IsNullOrEmpty(avUrl) || string.IsNullOrEmpty(name) || string.IsNullOrWhiteSpace(id))
                    {
                        continue;
                    }

                    ScanURL scan = new ScanURL
                    {
                        Category   = url.Value,
                        ID         = id,
                        IsDownload = false,
                        Title      = name,
                        URL        = avUrl
                    };

                    if (!JavDataBaseManager.HasScan(scan))
                    {
                        unScanCount++;
                        JavDataBaseManager.InsertScanURL(scan);
                    }
                }

                Console.WriteLine(url.Value + " 第 " + index + " / " + urls.Count + " 页, 加入" + unScanCount + " 条未扫描AV");
            }
        }
Exemplo n.º 18
0
        /// <summary>
        ///     Downloads a JavBus detail page and extracts its metadata with regular expressions.
        /// </summary>
        /// <param name="url">Address of the detail page.</param>
        /// <param name="cc">Cookies sent with the request.</param>
        /// <param name="mapping">Not used by this method; kept for signature compatibility.</param>
        /// <returns>
        ///     The populated <c>AV</c>, or <c>null</c> when the page could not be downloaded. Multi-valued fields
        ///     (director, actress, company, publisher, category) are comma-terminated concatenations of every match.
        ///     An unparseable release date becomes 2050-01-01 and an unparseable length becomes 0.
        /// </returns>
        public static AV GetJavBusSearchDetail(string url, CookieContainer cc, Dictionary <string, string> mapping)
        {
            var listHtml = HtmlManager.GetHtmlContentViaUrl(url, "utf-8", false, cc);

            if (!listHtml.Success)
            {
                return(null);
            }

            var html = listHtml.Content;

            var titleTemplate     = "<h3>(.*?)</h3>";
            var imgTemplate       = "<a class=\"bigImage\" href=\"(.*?)\">";
            var idTemplate        = "<span style=\"color:#CC0000;\">(.*?)</span>";
            var dateTemplate      = "<p><span class=\"header\">發行日期:</span>(.*?)</p>";
            var directorTemplate  = "<p><span class=\"header\">導演:</span> <a href=\"(.*?)\">(.*?)</a></p>";
            var lengthTemplate    = "<p><span class=\"header\">長度:</span>(.*?)分鐘</p>";
            var actressTemplate   = "<div class=\"star-name\"><a href=\"(.*?)\" title=\"(.*?)\">(.*?)</a></div>";
            var companyTemplate   = "<p><span class=\"header\">製作商:</span> <a href=\"(.*?)\">(.*?)</a>";
            var publisherTemplate = "<p><span class=\"header\">發行商:</span> <a href=\"(.*?)\">(.*?)</a>";
            var categotyTemplate  = "<span class=\"genre\"><a href=\"(.*?)\">(.*?)</a></span>";

            var id       = Regex.Match(html, idTemplate).Groups[1].Value;
            var rawTitle = Regex.Match(html, titleTemplate).Groups[1].Value;

            // string.Replace throws on an empty search string, which is what an unmatched id regex produces.
            var title = (id.Length > 0 ? rawTitle.Replace(id, "") : rawTitle).Trim();

            // TryParse resets its out argument on failure, so the sentinel has to be applied afterwards.
            DateTime releaseDate;
            if (!DateTime.TryParse(Regex.Match(html, dateTemplate).Groups[1].Value, out releaseDate))
            {
                releaseDate = new DateTime(2050, 1, 1);
            }

            // A missing or non-numeric length no longer throws; it is stored as 0.
            int length;
            int.TryParse(Regex.Match(html, lengthTemplate).Groups[1].Value, out length);

            AV av = new AV();

            av.Name        = title;
            av.ID          = id;
            av.PictureURL  = Regex.Match(html, imgTemplate).Groups[1].Value;
            av.ReleaseDate = releaseDate;
            av.AvLength    = length;
            av.Director    = JoinMatchGroups(Regex.Matches(html, directorTemplate), 2);
            av.Actress     = JoinMatchGroups(Regex.Matches(html, actressTemplate), 3);
            av.Company     = JoinMatchGroups(Regex.Matches(html, companyTemplate), 2);
            av.Publisher   = JoinMatchGroups(Regex.Matches(html, publisherTemplate), 2);
            av.Category    = JoinMatchGroups(Regex.Matches(html, categotyTemplate), 2);

            return(av);
        }

        /// <summary>
        ///     Concatenates capture group <paramref name="groupIndex"/> of every match, each followed by a comma
        ///     (so a non-empty result always ends with ",").
        /// </summary>
        private static string JoinMatchGroups(System.Text.RegularExpressions.MatchCollection matches, int groupIndex)
        {
            var joined = new System.Text.StringBuilder();

            foreach (System.Text.RegularExpressions.Match match in matches)
            {
                joined.Append(match.Groups[groupIndex].Value).Append(',');
            }

            return joined.ToString();
        }
Exemplo n.º 19
0
        /// <summary>
        ///     Loads one page of the hanhande.net manga list, optionally filtered by category, and parses it into a view model.
        /// </summary>
        /// <param name="category">
        ///     Category segments joined with "-" in the URL. <c>null</c>, an empty array, or a single empty entry
        ///     selects the unfiltered list.
        /// </param>
        /// <param name="page">Page number placed in the URL and echoed back as <c>CurrentPage</c>.</param>
        /// <returns>
        ///     The view model with the parsed mangas. <c>MsgCode</c>/<c>Msg</c> are set to <c>VMCode.Error</c> when the page
        ///     could not be downloaded and to <c>VMCode.Exception</c> when parsing threw. <c>PageSize</c> is the number of
        ///     mangas parsed.
        /// </returns>
        private static MangaCategoryListVM GetManageCategoryListHanhan(string[] category, int page)
        {
            MangaCategoryListVM ret = new MangaCategoryListVM();

            ret.Mangas      = new List <MangaCategoryListItem>();
            ret.CurrentPage = page;

            var urlPrefix = "http://www.hanhande.net/";
            var url       = "list";

            bool unfiltered = category == null || category.Length == 0 || (category.Length == 1 && string.IsNullOrEmpty(category[0]));

            if (unfiltered)
            {
                url += "_" + page + "/";
            }
            else
            {
                url += "/" + string.Join("-", category) + "/" + page + "/";
            }

            var searchUrl = urlPrefix + url;

            // NOTE(review): the first argument names hanhan.net while the page lives on hanhande.net — confirm this is intended.
            var htmlRet = HtmlManager.GetHtmlWebClient("http://www.hanhan.net", searchUrl);

            if (htmlRet.Success)
            {
                try
                {
                    HtmlDocument document = new HtmlDocument();
                    document.LoadHtml(htmlRet.Content);

                    var listNodes = document.DocumentNode.SelectNodes("//li[@class='item-lg']");
                    var pageNode  = document.DocumentNode.SelectSingleNode("//li[@class='last']");
                    var pageLink  = pageNode == null ? null : pageNode.ChildNodes.FindFirst("a");

                    if (pageLink != null)
                    {
                        // An absent or unparseable data-page leaves total at 0, i.e. TotalPage becomes 1.
                        int total;
                        int.TryParse(pageLink.GetAttributeValue("data-page", "").Trim(), out total);

                        ret.TotalPage = total + 1;
                    }

                    if (listNodes != null)
                    {
                        foreach (var node in listNodes)
                        {
                            var aTag = node.ChildNodes.FindFirst("a");

                            if (aTag == null)
                            {
                                continue;
                            }

                            MangaCategoryListItem temp = new MangaCategoryListItem();

                            // GetAttributeValue with a default avoids a NullReferenceException (which used to fail
                            // the whole page) when an element lacks the attribute.
                            temp.MangaUrl  = aTag.GetAttributeValue("href", "").Trim();
                            temp.MangaName = aTag.GetAttributeValue("title", "").Trim();

                            foreach (var subNode in aTag.ChildNodes)
                            {
                                if (subNode.Name == "img")
                                {
                                    temp.PicUrl = subNode.GetAttributeValue("src", "").Trim();
                                }
                                else if (subNode.Name == "span")
                                {
                                    var cssClass = subNode.GetAttributeValue("class", "").Trim();

                                    if (cssClass == "fd")
                                    {
                                        temp.IsFinished = true;
                                    }
                                    else if (cssClass == "tt")
                                    {
                                        temp.UpdateInfo = subNode.InnerHtml.Trim();
                                    }
                                }
                            }

                            foreach (var subNode in node.ChildNodes)
                            {
                                if (subNode.Name != "span" || subNode.GetAttributeValue("class", "").Trim() != "updateon")
                                {
                                    continue;
                                }

                                // UpdateDate is only assigned when a date could actually be parsed.
                                DateTime updateDate;

                                if (TryParseHanhanUpdateDate(subNode.InnerHtml, out updateDate))
                                {
                                    temp.UpdateDate = updateDate;
                                }
                            }

                            ret.Mangas.Add(temp);
                        }
                    }
                }
                catch (Exception e)
                {
                    ret.MsgCode = VMCode.Exception;
                    ret.Msg     = e.ToString();
                }
            }
            else
            {
                ret.MsgCode = VMCode.Error;
                ret.Msg     = "网页获取失败";
            }

            ret.PageSize = ret.Mangas.Count;
            return(ret);
        }

        /// <summary>
        ///     Parses the date that follows the first ':' in <paramref name="text"/> (or the whole text when there is no ':').
        ///     Tries the full remainder first, then only its first space-delimited token.
        /// </summary>
        /// <remarks>
        ///     Replaces a <c>Substring(start, IndexOf(" "))</c> call that passed an index where a length is expected.
        /// </remarks>
        private static bool TryParseHanhanUpdateDate(string text, out DateTime updateDate)
        {
            updateDate = default(DateTime);

            if (string.IsNullOrEmpty(text))
            {
                return false;
            }

            // IndexOf returns -1 when no colon exists, which makes the start index 0.
            var value = text.Substring(text.IndexOf(':') + 1).Trim();

            if (DateTime.TryParse(value, out updateDate))
            {
                return true;
            }

            var spaceIndex = value.IndexOf(' ');

            return spaceIndex > 0 && DateTime.TryParse(value.Substring(0, spaceIndex), out updateDate);
        }