// Stores the injected HtmlManager (used as the connection service), then builds
// the XAML component tree. DI supplies connectionService.
public MainPage(HtmlManager connectionService) { ConnectionService = connectionService; InitializeComponent(); }
/// <summary>
/// Scrapes javbus.com for magnet links belonging to the given AV id.
/// </summary>
/// <param name="avId">The AV id used as the detail-page path segment.</param>
/// <param name="cc">Optional cookie container forwarded to the HTTP helpers.</param>
/// <returns>The magnets found; empty when the page cannot be fetched or parsed.</returns>
public static List <SeedMagnetSearchModel> SearchJavBus(string avId, CookieContainer cc = null)
{
    List <SeedMagnetSearchModel> ret = new List <SeedMagnetSearchModel>();

    var refere = "https://www.javbus.com/" + avId;
    var html = HtmlManager.GetHtmlContentViaUrl(refere, "utf-8", false, cc);
    if (!html.Success)
    {
        return ret;
    }

    // gid/uc/img are embedded as javascript variables on the detail page and are
    // required by the ajax endpoint that serves the magnet table.
    var gidMatch = Regex.Match(html.Content, "var gid = (.*?);");
    var ucMatch = Regex.Match(html.Content, "var uc = (.*?);");
    var picMatch = Regex.Match(html.Content, "var img = '(.*?)';");
    if (!gidMatch.Success || !ucMatch.Success || !picMatch.Success)
    {
        // Page layout changed (or access was blocked) - bail out instead of
        // building a request from empty capture groups.
        return ret;
    }

    var gid = gidMatch.Groups[1].Value;
    var uc = ucMatch.Groups[1].Value;
    var pic = picMatch.Groups[1].Value;
    var url = $"https://www.javbus.com/ajax/uncledatoolsbyajax.php?gid={gid}&lang=zh&img={pic}&uc={uc}&floor=922";

    var magHtml = HtmlManager.GetHtmlWebClient(url, null, "javbus.com", "", refere);
    if (!magHtml.Success)
    {
        return ret;
    }

    HtmlDocument htmlDocument = new HtmlDocument();
    htmlDocument.LoadHtml(magHtml.Content);
    HtmlNodeCollection nodes = htmlDocument.DocumentNode.SelectNodes("//tr[@style=' border-top:#DDDDDD solid 1px']");
    if (nodes == null)
    {
        return ret;
    }

    foreach (var node in nodes)
    {
        try
        {
            if (node == null || node.ChildNodes.Count < 2)
            {
                continue;
            }

            var namePart = node.ChildNodes[1].InnerText.Trim();
            var magUrl = node.ChildNodes[1].ChildNodes[1].Attributes["href"].Value;

            var size = 0d;
            if (node.ChildNodes.Count >= 4)
            {
                size = FileSize.GetByteFromStr(node.ChildNodes[3].InnerText.Trim());
            }

            // BUG FIX: the original guarded ChildNodes[5] with Count >= 5, which
            // still threw (and silently skipped the row) when Count == 5.
            var datePart = "";
            if (node.ChildNodes.Count >= 6)
            {
                datePart = node.ChildNodes[5].InnerText.Trim();
            }

            // The original used DateTime.Parse and relied on the catch below to
            // skip rows with malformed dates; TryParse keeps the same skip
            // semantics without throwing.
            DateTime date;
            if (!DateTime.TryParse(datePart, out date))
            {
                continue;
            }

            ret.Add(new SeedMagnetSearchModel()
            {
                CompleteCount = 0,
                Date = date,
                Size = size,
                MagUrl = magUrl,
                Source = SearchSeedSiteEnum.JavBus,
                Title = namePart,
                Url = ""
            });
        }
        catch (Exception)
        {
            // Best effort: one malformed row must not abort the whole scrape.
        }
    }

    return ret;
}
/// <summary>
/// Fetches <paramref name="url"/> from javlibrary, renewing the shared cookie
/// container ("cc", declared at class level) when the session has expired.
/// </summary>
private static Utils.HtmlResponse JavCookieContanierHelper(string url)
{
    // Delegate straight to the shared HtmlManager helper.
    return HtmlManager.GetHtmlWebClientWithRenewCC("http://www.javlibrary.com/cn/", url, cc);
}
/// <summary>
/// Initializes a new instance of the <see cref = "Crawler" /> class: creates the
/// DAO, all component managers, the cache, the Engine, required output
/// directories, and (optionally) the crawler-peer server.
/// </summary>
/// <param name="applicationSettings">Application settings; may be replaced by the DAO's copy.</param>
/// <param name="webSettings">Web settings shared by all managers.</param>
/// <param name="crawlMode">The crawl mode to operate in.</param>
/// <param name="crawlerPeers">Peer crawlers to coordinate with; may be null/empty.</param>
/// <param name="databasePeers">Peer databases; may be null/empty.</param>
/// <param name="enableRenderers">Whether renderers are enabled for this crawler and its Engine.</param>
/// <exception cref="InvalidConfigurationException">Rethrown (re-wrapped) when configuration is invalid.</exception>
public Crawler(ApplicationSettings applicationSettings, WebSettings webSettings, CrawlMode crawlMode, List <CrawlerPeer> crawlerPeers, List <DatabasePeer> databasePeers, bool enableRenderers)
{
    Guid = Guid.NewGuid();

    try
    {
        _applicationSettings = applicationSettings;
        _webSettings = webSettings;

        _arachnodeDAO = (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), _applicationSettings.ConnectionString, _applicationSettings, _webSettings, true, true);
        // Adopt the DAO's settings instance as authoritative (original behavior).
        _applicationSettings = _arachnodeDAO.ApplicationSettings;

        _consoleManager = new ConsoleManager <TArachnodeDAO>(_applicationSettings, _webSettings);
        _consoleManager.OutputString("arachnode.net " + Assembly.GetExecutingAssembly().GetName().Version, ConsoleColor.Green, ConsoleColor.Gray);

        _actionManager = new ActionManager <TArachnodeDAO>(_applicationSettings, _webSettings, _consoleManager);
        _ruleManager = new RuleManager <TArachnodeDAO>(_applicationSettings, _webSettings, _consoleManager);
        _memoryManager = new MemoryManager <TArachnodeDAO>(_applicationSettings, _webSettings);
        // BUG FIX: the original assigned _cacheManager twice with identical
        // arguments; the duplicate assignment was removed.
        _cacheManager = new CacheManager <TArachnodeDAO>(_applicationSettings, _webSettings);
        _cookieManager = new CookieManager();

        CrawlerPeers = crawlerPeers;
        DatabasePeers = databasePeers;

        _crawlerPeerManager = new CrawlerPeerManager <TArachnodeDAO>(_applicationSettings, _webSettings, CrawlerPeers, (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), _applicationSettings.ConnectionString, _applicationSettings, _webSettings, true, true));
        _databasePeerManager = new DatabasePeerManager <TArachnodeDAO>(_applicationSettings, _webSettings, DatabasePeers);

        _cache = new Cache <TArachnodeDAO>(_applicationSettings, _webSettings, this, _actionManager, _cacheManager, _crawlerPeerManager, _memoryManager, _ruleManager);

        _dataTypeManager = new DataTypeManager <TArachnodeDAO>(_applicationSettings, _webSettings);
        _discoveryManager = new DiscoveryManager <TArachnodeDAO>(_applicationSettings, _webSettings, _cache, _actionManager, _cacheManager, _memoryManager, _ruleManager);
        _crawlRequestManager = new CrawlRequestManager <TArachnodeDAO>(_applicationSettings, _webSettings, _cache, _consoleManager, _discoveryManager);
        _encodingManager = new EncodingManager <TArachnodeDAO>(_applicationSettings, _webSettings);
        _htmlManager = new HtmlManager <TArachnodeDAO>(_applicationSettings, _webSettings, _discoveryManager);
        _politenessManager = new PolitenessManager <TArachnodeDAO>(_applicationSettings, _webSettings, _cache);
        _proxyManager = new ProxyManager <TArachnodeDAO>(_applicationSettings, _webSettings, _consoleManager);
        _reportingManager = new ReportingManager <TArachnodeDAO>(_applicationSettings, _webSettings, _consoleManager);

        //create required directories...
        if (!Directory.Exists(_applicationSettings.ConsoleOutputLogsDirectory))
        {
            Directory.CreateDirectory(_applicationSettings.ConsoleOutputLogsDirectory);
        }
        if (!Directory.Exists(_applicationSettings.DownloadedFilesDirectory))
        {
            Directory.CreateDirectory(_applicationSettings.DownloadedFilesDirectory);
        }
        if (!Directory.Exists(_applicationSettings.DownloadedImagesDirectory))
        {
            Directory.CreateDirectory(_applicationSettings.DownloadedImagesDirectory);
        }
        if (!Directory.Exists(_applicationSettings.DownloadedWebPagesDirectory))
        {
            Directory.CreateDirectory(_applicationSettings.DownloadedWebPagesDirectory);
        }

        QueryProcessor = new QueryProcessor <TArachnodeDAO>();

        _consoleManager.OutputString("Crawler: Initializing Configuration/Database Connection.", ConsoleColor.White, ConsoleColor.Gray);

        LoadCrawlActions(_arachnodeDAO);
        LoadCrawlRules(_arachnodeDAO);

        AreRenderersEnabled = enableRenderers;

        Engine = new Engine <TArachnodeDAO>(_applicationSettings, _webSettings, this, _cache, _actionManager, _cacheManager, _consoleManager, _cookieManager, _crawlRequestManager, _dataTypeManager, _discoveryManager, _encodingManager, _htmlManager, _memoryManager, _politenessManager, _proxyManager, _reportingManager, _ruleManager, enableRenderers, (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), _applicationSettings.ConnectionString, _applicationSettings, _webSettings, true, true));

        CrawlMode = crawlMode;

        if (CrawlerPeerManager != null && CrawlerPeerManager.CrawlerPeers != null && CrawlerPeerManager.CrawlerPeers.Count != 0)
        {
            ConsoleManager.OutputString("Crawler: Starting CrawlerPeerManager Server", ConsoleColor.White, ConsoleColor.Gray);

            CrawlerPeerManager.StartServer(this, _arachnodeDAO);

            _crawlerPeerManager.SendStatusMessageToCrawlerPeers(_arachnodeDAO);
        }

        if (Debugger.IsAttached)
        {
            _consoleManager.OutputString("Debugger: Attached - Expect Performance Degradation.", ConsoleColor.Yellow, ConsoleColor.Gray);
        }

        //update all core/components/managers with the updated ApplicationSettings...
#if DEMO
        Engine.CrawlRequestCompleted += Engine_CrawlRequestCompleted;
        _stopwatch.Start();
#endif
    }
    catch (InvalidConfigurationException invalidConfigurationException)
    {
        ProcessException(invalidConfigurationException);
        throw new InvalidConfigurationException(invalidConfigurationException.ApplicationSettings, invalidConfigurationException.WebSettings, invalidConfigurationException.Message, InvalidConfigurationExceptionSeverity.Error);
    }
    catch (Exception exception)
    {
        ProcessException(exception);
        // BUG FIX: rethrow with the original stack trace instead of wrapping in
        // a bare new Exception, which discarded the root cause's stack.
        throw;
    }
}
/// <summary>
/// Fetches and parses the hanhan.net manga detail page at <paramref name="path"/>.
/// </summary>
/// <param name="sourceType">Category source type (not read by this parser).</param>
/// <param name="path">Relative path of the detail page.</param>
/// <returns>A view model with cover, author, status, description and chapter list.</returns>
public static MangaDetailVM GetMangaDetailHanhan(MangaCategorySourceType sourceType, string path)
{
    MangaDetailVM ret = new MangaDetailVM();
    ret.Chapters = new List <MangaChapter>();

    var htmlRet = HtmlManager.GetHtmlWebClient("http://www.hanhan.net", path);
    if (htmlRet.Success)
    {
        try
        {
            HtmlDocument document = new HtmlDocument();
            document.LoadHtml(htmlRet.Content);

            var picPath = "//img[@class='pic']";
            var detailPath = "//ul[@class='detail-list cf']/li";
            var infoPath = "//div[@id='intro-all']//p";
            var chapterPath = "//ul[@id='chapter-list-4']//a";

            var picNode = document.DocumentNode.SelectSingleNode(picPath);
            var infoNode = document.DocumentNode.SelectSingleNode(infoPath);
            var chapterNodes = document.DocumentNode.SelectNodes(chapterPath);
            var detailNodes = document.DocumentNode.SelectNodes(detailPath);

            if (picNode != null)
            {
                ret.PicUrl = picNode.Attributes["src"].Value.Trim();
                ret.MangaName = picNode.Attributes["alt"].Value.Trim();
            }

            if (detailNodes != null && detailNodes.Count > 0)
            {
                foreach (var node in detailNodes)
                {
                    if (node.Attributes.Count <= 0)
                    {
                        // Attribute-less <li> rows carry the author line.
                        foreach (var subNode in node.ChildNodes)
                        {
                            if (subNode.InnerText.StartsWith("漫画作者:"))
                            {
                                ret.Author = subNode.InnerText.Replace("漫画作者:", "");
                            }
                        }
                    }
                    else
                    {
                        ret.MangaStatus = node.ChildNodes.FindFirst("a").InnerHtml;
                        ret.UpdateDate = DateTime.Parse(node.ChildNodes[1].ChildNodes[5].InnerText);
                        ret.UpdateInfo = "更新到:" + node.ChildNodes[1].ChildNodes[7].InnerText;
                    }
                }
            }

            if (infoNode != null)
            {
                ret.Description = infoNode.InnerText.Trim();
            }

            if (chapterNodes != null)
            {
                foreach (var node in chapterNodes)
                {
                    // NOTE(review): chapter links are rebased onto hanhande.net
                    // while the page itself is fetched from hanhan.net - confirm
                    // this cross-domain rebase is intentional.
                    ret.Chapters.Add(new MangaChapter
                    {
                        Url = "http://www.hanhande.net" + node.Attributes["href"].Value.Trim(),
                        ChapterName = node.ChildNodes.FindFirst("span").InnerText.Trim()
                    });
                }
            }
        }
        catch (Exception e)
        {
            ret.MsgCode = VMCode.Exception;
            ret.Msg = e.ToString();
        }
    }
    else
    {
        // BUG FIX: the original reported VMCode.Success when the page could not
        // be fetched; use the failure code so callers can detect the error.
        // NOTE(review): VMCode.Exception is the only failure member visible from
        // here - confirm whether a dedicated "fetch failed" code exists.
        ret.MsgCode = VMCode.Exception;
        ret.Msg = "网页获取失败";
    }

    return ret;
}
/// <summary>
/// Single-threaded scan: downloads the detail page for every not-yet-downloaded
/// ScanURL, stores the parsed AV record, marks the URL done and fetches its
/// cover image when not already on disk.
/// </summary>
private static void ScanEachAvSingleThread()
{
    List <ScanURL> urls = JavDataBaseManager.GetScanURL().Where(x => x.IsDownload == false).ToList();
    int index = 0;

    foreach (var url in urls)
    {
        int retry = 1;

        // Double-check: skip URLs whose AV record already exists.
        if (!JavDataBaseManager.HasAv(url.URL))
        {
            var htmlRes = new Utils.HtmlResponse();

            // Retry up to 5 times, renewing the cookie whenever it has expired.
            while (retry <= 5)
            {
                htmlRes = HtmlManager.GetHtmlWebClientWithRenewCC("http://www.javlibrary.com/cn/", url.URL, cc);
                if (htmlRes.IsExpire)
                {
                    GetJavCookie();
                    retry++;
                    continue;
                }
                else
                {
                    break;
                }
            }

            if (htmlRes.Success)
            {
                index++;

                // (Removed an HtmlDocument that the original created and loaded
                // but never read - GenerateAVModel works on the raw content.)
                var av = GenerateAVModel(htmlRes.Content, url.URL);
                JavDataBaseManager.InsertAV(av);
                Console.WriteLine("线程 " + Thread.CurrentThread.ManagedThreadId.ToString() + " => 插入AV => " + av.ID + " - " + av.Name);
                JavDataBaseManager.UpdateScanURL(url.URL);

                string result = "";
                if (!File.Exists(ImgFolder + av.ID + av.Name + ".jpg"))
                {
                    result = DownloadHelper.DownloadHttps(av.PictureURL, ImgFolder + av.ID + av.Name + ".jpg", "");
                    if (string.IsNullOrEmpty(result))
                    {
                        Console.WriteLine("线程 " + Thread.CurrentThread.ManagedThreadId.ToString() + " => 下载AV图片成功 => " + av.ID + " - " + av.Name);
                    }
                    else
                    {
                        // Non-empty result is the downloader's error message.
                        Console.WriteLine(result);
                    }
                }
                else
                {
                    Console.WriteLine("已存在图片不下载");
                }

                Console.WriteLine("完成" + index + " / " + urls.Count);
            }
        }
        else
        {
            JavDataBaseManager.UpdateScanURL(url.URL);
            Console.WriteLine("详情页已下载 => " + url.URL + " 完成" + index + " / " + urls.Count);
        }
    }
}
/// <summary>
/// Clears the HtmlManager's state, then generates and returns the HTML for the
/// current file.
/// </summary>
public static string getHTML()
{
    HtmlManager.Clean();
    var html = HtmlManager.createHtml(fileName);
    return html;
}
/// <summary>
/// Smoke test: refreshes the javlibrary cookie and issues an id search request.
/// </summary>
/// <param name="id">The keyword to search for.</param>
private static void TestJavLibrarySearch(string id)
{
    var cc = InitManager.UpdateCookie(null, "http://www.javlibrary.com/cn").CC;
    // The response is not inspected here; the call itself is the test.
    // (Removed the unused "res" local from the original.)
    HtmlManager.GetHtmlContentViaUrl("http://www.javlibrary.com/cn/vl_searchbyid.php?keyword=" + id, "utf-8", true, cc);
}
/// <summary>
/// Process a range of WebPageID after crawling. Useful if crawled WebPages were not processed at crawl time according to desired ApplicationSettings configuration.
/// Calling this method DOES change the 'LastDiscovered' fields where applicable.
/// This method is not used when crawling, rather during post-processing.
/// </summary>
/// <param name="webPageIDLowerBound">Inclusive lower bound of WebPage IDs to process.</param>
/// <param name="webPageIDUpperBound">Inclusive upper bound of WebPage IDs to process.</param>
public static void ProcessWebPages(Crawler <TArachnodeDAO> crawler, long webPageIDLowerBound, long webPageIDUpperBound)
{
    //do not assign the application settings. doing so will override the ApplicationSetting you set before calling this method...
    TArachnodeDAO arachnodeDAO = (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), crawler.ApplicationSettings.ConnectionString, crawler.ApplicationSettings, crawler.WebSettings, false, false);

    //build the manager graph required to replay a WebPage through the pipeline...
    ConsoleManager <TArachnodeDAO> consoleManager = new ConsoleManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
    ActionManager <TArachnodeDAO> actionManager = new ActionManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager);
    CookieManager cookieManager = new CookieManager();
    MemoryManager <TArachnodeDAO> memoryManager = new MemoryManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
    RuleManager <TArachnodeDAO> ruleManager = new RuleManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager);
    CacheManager <TArachnodeDAO> cacheManager = new CacheManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings);
    CrawlerPeerManager <TArachnodeDAO> crawlerPeerManager = new CrawlerPeerManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, null, arachnodeDAO);
    Cache <TArachnodeDAO> cache = new Cache <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, crawler, actionManager, cacheManager, crawlerPeerManager, memoryManager, ruleManager);
    DiscoveryManager <TArachnodeDAO> discoveryManager = new DiscoveryManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, cache, actionManager, cacheManager, memoryManager, ruleManager);
    HtmlManager <TArachnodeDAO> htmlManager = new HtmlManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, discoveryManager);

    //load the CrawlActions, CrawlRules and EngineActions...
    ruleManager.ProcessCrawlRules(crawler);
    actionManager.ProcessCrawlActions(crawler);
    actionManager.ProcessEngineActions(crawler);

    //these three methods are called in the Engine.
    UserDefinedFunctions.RefreshAllowedExtensions(true);
    UserDefinedFunctions.RefreshAllowedSchemes(true);
    UserDefinedFunctions.RefreshDisallowed();

    //instantiate a WebClient to access the ResponseHeaders...
    WebClient <TArachnodeDAO> webClient = new WebClient <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager, cookieManager, new ProxyManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, consoleManager));

    webClient.GetHttpWebResponse("http://google.com", "GET", null, null, null, null);

    WebPageManager <TArachnodeDAO> webPageManager = new WebPageManager <TArachnodeDAO>(crawler.ApplicationSettings, crawler.WebSettings, discoveryManager, htmlManager, arachnodeDAO);

    for (long i = webPageIDLowerBound; i <= webPageIDUpperBound; i++)
    {
        ArachnodeDataSet.WebPagesRow webPagesRow = null;

        try
        {
            //get the WebPage from the database. we need the source data as we don't store this in the index.
            //even though most of the fields are available in the Document, the WebPage is the authoritative source, so we'll use that for all of the fields.
            webPagesRow = arachnodeDAO.GetWebPage(i.ToString());

            if (webPagesRow != null)
            {
                if (webPagesRow.Source == null || webPagesRow.Source.Length == 0)
                {
                    //the source wasn't stored in the database; fall back to the copy saved to disk.
                    //(hoisted: GetDiscoveryPath was previously evaluated twice with identical arguments.)
                    string discoveryPath = discoveryManager.GetDiscoveryPath(crawler.ApplicationSettings.DownloadedWebPagesDirectory, webPagesRow.AbsoluteUri, webPagesRow.FullTextIndexType);

                    if (File.Exists(discoveryPath))
                    {
                        using (StreamReader streamReader = File.OpenText(discoveryPath))
                        {
                            webPagesRow.Source = Encoding.UTF8.GetBytes(streamReader.ReadToEnd());
                        }
                    }
                    else
                    {
                        //NOTE(review): processing still proceeds below even when no source could be found - confirm this is intended.
                        Console.WriteLine("WebPageID: " + i + " was NOT processed successfully.");

                        if (OnWebPageProcessed != null)
                        {
                            //fire-and-forget notification (original behavior; no EndInvoke is ever called).
                            OnWebPageProcessed.BeginInvoke(webPagesRow, "WebPageID: " + i + " was NOT processed successfully.", null, null);
                        }
                    }
                }

                ProcessWebPage(crawler.ApplicationSettings, crawler.WebSettings, crawler, webPagesRow, webClient, cache, actionManager, consoleManager, crawlerPeerManager, discoveryManager, memoryManager, ruleManager, webPageManager, arachnodeDAO);

                Console.WriteLine("WebPageID: " + i + " was processed successfully.");

                if (OnWebPageProcessed != null)
                {
                    OnWebPageProcessed.BeginInvoke(webPagesRow, "WebPageID: " + i + " was processed successfully.", null, null);
                }
            }
        }
        catch (Exception exception)
        {
            Console.WriteLine("WebPageID: " + i + " was NOT processed successfully.");
            Console.WriteLine(exception.Message);

            if (OnWebPageProcessed != null)
            {
                OnWebPageProcessed.BeginInvoke(webPagesRow, "WebPageID: " + i + " was NOT processed successfully.", null, null);
                OnWebPageProcessed.BeginInvoke(webPagesRow, exception.Message, null, null);
            }

            arachnodeDAO.InsertException(null, null, exception, false);
        }
    }

    //stop the CrawlActions, CrawlRules and EngineActions...
    ruleManager.Stop();
    actionManager.Stop();
}
/// <summary>
/// The WebPageManager: stores the collaborating managers and the DAO for use by
/// derived web-page managers.
/// </summary>
/// <param name = "arachnodeDAO">Must be thread-safe.</param>
protected AWebPageManager(ApplicationSettings applicationSettings, WebSettings webSettings, DiscoveryManager <TArachnodeDAO> discoveryManager, HtmlManager <TArachnodeDAO> htmlManager, IArachnodeDAO arachnodeDAO) : base(applicationSettings, webSettings)
{
    // Plain field captures; the base constructor handles the settings.
    _arachnodeDAO = arachnodeDAO;
    _htmlManager = htmlManager;
    _discoveryManager = discoveryManager;
}
/// <summary>
/// Processes a WebPagesRow after crawling by replaying it through the same
/// component pipeline a live crawl uses: rebuilds a CrawlRequest from the stored
/// row, restores its response headers, re-runs encoding, optionally re-inserts
/// the web page, then extracts email addresses and hyperlinks and fires
/// post-request crawl actions.
/// </summary>
/// <param name = "webPagesRow">The web pages row.</param>
/// <param name="webClient"></param>
/// <param name="actionManager"></param>
/// <param name="consoleManager"></param>
/// <param name="discoveryManager"></param>
/// <param name="memoryManager"></param>
/// <param name="ruleManager"></param>
/// <param name = "webPageManager">The web page manager.</param>
/// <param name = "arachnodeDAO">The arachnode DAO.</param>
/// <param name = "fileManager">The file manager.</param>
/// <param name = "imageManager">The image manager.</param>
public static void ProcessWebPage(ApplicationSettings applicationSettings, WebSettings webSettings, Crawler <TArachnodeDAO> crawler, ArachnodeDataSet.WebPagesRow webPagesRow, WebClient <TArachnodeDAO> webClient, Cache <TArachnodeDAO> cache, ActionManager <TArachnodeDAO> actionManager, ConsoleManager <TArachnodeDAO> consoleManager, CrawlerPeerManager <TArachnodeDAO> crawlerPeerManager, DiscoveryManager <TArachnodeDAO> discoveryManager, MemoryManager <TArachnodeDAO> memoryManager, RuleManager <TArachnodeDAO> ruleManager, WebPageManager <TArachnodeDAO> webPageManager, IArachnodeDAO arachnodeDAO)
{
    // Build the per-call managers that a live Crawl would normally own.
    CacheManager <TArachnodeDAO> cacheManager = new CacheManager <TArachnodeDAO>(applicationSettings, webSettings);
    CookieManager cookieManager = new CookieManager();
    CrawlRequestManager <TArachnodeDAO> crawlRequestManager = new CrawlRequestManager <TArachnodeDAO>(applicationSettings, webSettings, cache, consoleManager, discoveryManager);
    DataTypeManager <TArachnodeDAO> dataTypeManager = new DataTypeManager <TArachnodeDAO>(applicationSettings, webSettings);
    EncodingManager <TArachnodeDAO> encodingManager = new EncodingManager <TArachnodeDAO>(applicationSettings, webSettings);
    PolitenessManager <TArachnodeDAO> politenessManager = new PolitenessManager <TArachnodeDAO>(applicationSettings, webSettings, cache);
    ProxyManager <TArachnodeDAO> proxyManager = new ProxyManager <TArachnodeDAO>(applicationSettings, webSettings, consoleManager);
    HtmlManager <TArachnodeDAO> htmlManager = new HtmlManager <TArachnodeDAO>(applicationSettings, webSettings, discoveryManager);
    Crawl <TArachnodeDAO> crawl = new Crawl <TArachnodeDAO>(applicationSettings, webSettings, crawler, actionManager, consoleManager, cookieManager, crawlRequestManager, dataTypeManager, discoveryManager, encodingManager, htmlManager, politenessManager, proxyManager, ruleManager, true);

    //create a CrawlRequest as this is what the internals of SiteCrawler.dll expect to operate on...
    CrawlRequest <TArachnodeDAO> crawlRequest = new CrawlRequest <TArachnodeDAO>(new Discovery <TArachnodeDAO>(webPagesRow.AbsoluteUri), webPagesRow.CrawlDepth, UriClassificationType.Host, UriClassificationType.Host, 0, RenderType.None, RenderType.None);

    // Rehydrate the request from the stored row.
    crawlRequest.Crawl = crawl;
    crawlRequest.Discovery.DiscoveryType = DiscoveryType.WebPage;
    crawlRequest.Discovery.ID = webPagesRow.ID;
    crawlRequest.Data = webPagesRow.Source;
    crawlRequest.CurrentDepth = webPagesRow.CrawlDepth;
    crawlRequest.Encoding = Encoding.GetEncoding(webPagesRow.CodePage);
    crawlRequest.ProcessData = true;
    crawlRequest.WebClient = webClient;

    crawlRequest.WebClient.HttpWebResponse.Headers.Clear();

    //parse the ResponseHeaders from the WebPagesRow.ResponseHeaders string...
    foreach (string responseHeader in webPagesRow.ResponseHeaders.Split("\r\n".ToCharArray(), StringSplitOptions.RemoveEmptyEntries))
    {
        // Only the header name is taken from the Split; the value is re-extracted
        // from the full header string via the UDF (presumably because header
        // values may themselves contain ':' - confirm).
        string[] responseHeaderSplit = responseHeader.Split(":".ToCharArray());
        string name = responseHeaderSplit[0];
        string value = UserDefinedFunctions.ExtractResponseHeader(webPagesRow.ResponseHeaders, name, true).Value;
        crawlRequest.WebClient.HttpWebResponse.Headers.Add(name, value);
    }

    //refresh the DataTypes in the DataTypeManager... (if necessary)...
    if (dataTypeManager.AllowedDataTypes.Count == 0)
    {
        dataTypeManager.RefreshDataTypes();
    }

    crawlRequest.DataType = dataTypeManager.DetermineDataType(crawlRequest);

    //now, process the bytes...
    encodingManager.ProcessCrawlRequest(crawlRequest, arachnodeDAO);

    if (applicationSettings.InsertWebPages)
    {
        // Re-insert; the returned ID replaces the one taken from the row.
        crawlRequest.Discovery.ID = arachnodeDAO.InsertWebPage(crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.WebClient.HttpWebResponse.Headers.ToString(), applicationSettings.InsertWebPageSource ? crawlRequest.Data : new byte[] { }, crawlRequest.Encoding.CodePage, crawlRequest.DataType.FullTextIndexType, crawlRequest.CurrentDepth, applicationSettings.ClassifyAbsoluteUris);
    }

    crawlRequest.ManagedDiscovery = webPageManager.ManageWebPage(crawlRequest.Discovery.ID.Value, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.Data, crawlRequest.Encoding, crawlRequest.DataType.FullTextIndexType, applicationSettings.ExtractWebPageMetaData, applicationSettings.InsertWebPageMetaData, applicationSettings.SaveDiscoveredWebPagesToDisk);

    //assigning FileAndImageDiscoveries isn't applicable because Files and Images need to be crawled to be properly classified... without classification we don't know whether they belong in dbo.Files or dbo.Images...
    crawlRequestManager.ProcessEmailAddresses(crawlRequest, arachnodeDAO);
    crawlRequestManager.ProcessHyperLinks(crawlRequest, arachnodeDAO);

    actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PostRequest, arachnodeDAO);

    discoveryManager.CloseAndDisposeManagedDiscovery(crawlRequest, arachnodeDAO);
}
/// <summary>
/// Handles the Load event of the Page control: populates the search-result row
/// (title, summary or thumbnail, cached/browse/explain links, score line) from
/// the Lucene Document bound to this control.
/// </summary>
/// <param name = "sender">The source of the event.</param>
/// <param name = "e">The <see cref = "System.EventArgs" /> instance containing the event data.</param>
protected void Page_Load(object sender, EventArgs e)
{
    if (Document == null)
    {
        return;
    }

    string absoluteUri = Document.GetField("absoluteuri").StringValue();
    string pageTitle = Document.GetField("title").StringValue();
    string discoveryType = Document.GetField("discoverytype").StringValue();
    string discoveryID = Document.GetField("discoveryid").StringValue();
    // Hoisted: this field was previously looked up three separate times below.
    string fullTextIndexType = Document.GetField("fulltextindextype").StringValue();

    uxHlTitle.NavigateUrl = absoluteUri;

    if (!string.IsNullOrEmpty(pageTitle))
    {
        uxHlTitle.Text = pageTitle.Length > WebSettings.MaximumPageTitleLength ? pageTitle.Substring(0, WebSettings.MaximumPageTitleLength) + "..." : pageTitle;
    }
    else
    {
        // No indexed title - fall back to the file name, then the URI itself.
        // (Hoisted: ExtractFileName was previously evaluated up to three times.)
        var fileName = UserDefinedFunctions.ExtractFileName(absoluteUri);
        if (!fileName.IsNull)
        {
            uxHlTitle.Text = fileName.Value.Length > WebSettings.MaximumPageTitleLength ? fileName.Value.Substring(0, WebSettings.MaximumPageTitleLength) + "..." : fileName.Value;
        }
        else
        {
            uxHlTitle.Text = absoluteUri;
        }
    }

    if (discoveryType != "image")
    {
        uxLblSummary.Text = Summary;
        uxImgImage.Visible = false;
    }
    else
    {
        // Images get a thumbnail instead of a text summary.
        uxLblSummary.Visible = false;
        uxImgImage.ImageUrl = HtmlManager.GetImageUrl(absoluteUri, fullTextIndexType, ArachnodeDAO);
    }

    uxLblAbsoluteUri.Text = absoluteUri;

    switch (discoveryType)
    {
        case "file":
            uxHlCached.NavigateUrl = HtmlManager.GetFileUrl(absoluteUri, fullTextIndexType, ArachnodeDAO);
            break;
        case "image":
            uxHlCached.NavigateUrl = HtmlManager.GetImageUrl(absoluteUri, fullTextIndexType, ArachnodeDAO);
            break;
        case "webpage":
            uxHlBrowse.Visible = true;
            uxHlBrowse.NavigateUrl = "/Browse.aspx?discoveryID=" + discoveryID + "&absoluteUri=" + HttpUtility.UrlEncode(absoluteUri);
            uxHlCached.NavigateUrl = "/Cached.aspx?discoveryID=" + discoveryID + "&absoluteUri=" + HttpUtility.UrlEncode(absoluteUri) + "&webPage=" + Encryption.EncryptRijndaelManaged(Document.GetField("discoverypath").StringValue()) + "&codePage=" + Document.GetField("codepage").StringValue() + "&fullTextIndexType=" + fullTextIndexType;
            break;
    }

    uxHlExplain.NavigateUrl = "/Explanation.aspx?query=" + Request.QueryString["query"] + "&absoluteUri=" + absoluteUri + "&documentID=" + Document.GetField("documentid").StringValue() + "&strength=" + Document.GetField("strength").StringValue();

    // TryParse leaves the value at 0 when the stored field is not numeric.
    double score;
    double.TryParse(Document.GetField("relevancyscore").StringValue(), out score);

    double strength;
    double.TryParse(Document.GetField("strength").StringValue(), out strength);

    uxLblScoreAndStrength.Text = "Score:" + Math.Round(score, 2) + " Strength:" + Math.Round(strength, 2) + " = Total:" + (score * strength);
}
// Stores the injected auth and HTML services, then builds the XAML component tree.
public LoginPage(AuthService authService, HtmlManager htmlService) { _authService = authService; _htmlService = htmlService; InitializeComponent(); }
/// <summary>
/// Searches btsow for seeds matching <paramref name="id"/>, then resolves each
/// result's magnet link from its detail page.
/// </summary>
/// <param name="id">The search keyword.</param>
/// <returns>Results ordered by size, descending; empty on any failure.</returns>
public static List <SeedMagnetSearchModel> SearchBtsow(string id)
{
    List <SeedMagnetSearchModel> ret = new List <SeedMagnetSearchModel>();

    try
    {
        var serachContent = "https://btsow.space/search/" + id;
        var htmlRet = HtmlManager.GetHtmlWebClient("https://btsow.space", serachContent, null, true);
        if (htmlRet.Success)
        {
            HtmlDocument htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(htmlRet.Content);

            string xpath = "//div[@class='row']";
            HtmlNodeCollection nodes = htmlDocument.DocumentNode.SelectNodes(xpath);

            // BUG FIX: SelectNodes returns null when nothing matches; the
            // original dereferenced it and relied on the catch-all below.
            if (nodes != null)
            {
                // The final row is skipped (original behavior; presumably a
                // non-result row - confirm against the live markup).
                foreach (var node in nodes.Take(nodes.Count - 1))
                {
                    var text = node.ChildNodes[1].ChildNodes[1].InnerText.Trim();
                    var size = FileUtility.GetFileSizeFromString(node.ChildNodes[3].InnerText.Trim());
                    var date = node.ChildNodes[5].InnerText.Trim();

                    // Pull the detail href out of the raw markup: the first
                    // quoted value of the anchor's OuterHtml.
                    var a = node.ChildNodes[1].OuterHtml;
                    var url = a.Substring(a.IndexOf("\"") + 1);
                    url = url.Substring(0, url.IndexOf("\""));

                    ret.Add(new SeedMagnetSearchModel
                    {
                        Title = text,
                        Size = size,
                        Date = DateTime.Parse(date),
                        Url = url,
                        Source = SearchSeedSiteEnum.Btsow
                    });
                }

                // Each detail page hosts the actual magnet link in a textarea.
                foreach (var r in ret)
                {
                    var subHtmlRet = HtmlManager.GetHtmlContentViaUrl(r.Url);
                    if (subHtmlRet.Success)
                    {
                        htmlDocument = new HtmlDocument();
                        htmlDocument.LoadHtml(subHtmlRet.Content);
                        HtmlNode node = htmlDocument.DocumentNode.SelectSingleNode("//textarea[@class='magnet-link hidden-xs']");
                        if (node != null)
                        {
                            r.MagUrl = node.InnerText;
                        }
                    }
                }
            }
        }
    }
    catch (Exception)
    {
        // Best effort: return whatever was collected before the failure.
    }

    return ret.OrderByDescending(x => x.Size).ToList();
}
/// <summary>
/// Adds one or more magnet links as offline-download tasks on 115.com using the
/// session cookie persisted in the database.
/// </summary>
/// <param name="mag">A single magnet link, or several "magnet:?" links concatenated.</param>
/// <returns>JSON with the service's success flag and a message.</returns>
public JsonResult Add115Task(string mag)
{
    CookieContainer cc = new CookieContainer();
    bool ret = false;
    string msg = "";

    // Rehydrate the persisted 115.com session cookie.
    foreach (var t in JsonConvert.DeserializeObject <List <CookieItem> >(ScanDataBaseManager.GetOneOneFiveCookie().OneOneFiveCookie))
    {
        Cookie c = new Cookie(t.Name, t.Value, "/", "115.com");
        cc.Add(c);
    }

    // Multiple magnets may arrive as one string; split on the scheme prefix.
    // (Materialized to a list - the original enumerated the LINQ query three times.)
    var split = mag.Split(new string[] { "magnet:?" }, StringSplitOptions.None).Where(x => !string.IsNullOrEmpty(x)).ToList();

    Dictionary <string, string> param = new Dictionary <string, string>();
    if (split.Count <= 1)
    {
        param.Add("url", mag);
    }
    else
    {
        int index = 0;
        foreach (var s in split)
        {
            param.Add(string.Format("url[{0}]", index), "magnet:?" + s);
            index++;
        }
    }

    param.Add("sign", "");
    param.Add("uid", "340200422"); // NOTE(review): hard-coded account id - confirm.
    param.Add("time", DateTime.Now.ToFileTimeUtc() + "");

    // Single tasks and batches use different endpoints.
    var returnStr = "";
    if (split.Count <= 1)
    {
        returnStr = HtmlManager.Post("https://115.com/web/lixian/?ct=lixian&ac=add_task_url", param, cc);
    }
    else
    {
        returnStr = HtmlManager.Post("https://115.com/web/lixian/?ct=lixian&ac=add_task_urls", param, cc);
    }

    if (!string.IsNullOrEmpty(returnStr))
    {
        var data = Newtonsoft.Json.Linq.JObject.Parse(returnStr);

        // BUG FIX: Property(...) returns null when the key is absent; the
        // original dereferenced both properties unconditionally.
        var stateProperty = data.Property("state");
        if (stateProperty != null)
        {
            bool.TryParse(stateProperty.Value.ToString(), out ret);
        }

        if (ret == false)
        {
            var errorProperty = data.Property("error_msg");
            if (errorProperty != null)
            {
                msg = errorProperty.Value.ToString();
            }
        }
    }

    if (string.IsNullOrEmpty(msg))
    {
        msg = "下载成功";
    }

    return Json(new { status = ret, msg = msg }, JsonRequestBehavior.AllowGet);
}
/// <summary>
/// Searches sukebei.nyaa.si for seeds matching <paramref name="id"/>.
/// </summary>
/// <param name="id">The search keyword.</param>
/// <param name="cc">Optional cookie container for the request.</param>
/// <returns>Non-negative-size results ordered by complete count, then size, descending.</returns>
public static List <SeedMagnetSearchModel> SearchSukebei(string id, CookieContainer cc = null)
{
    List <SeedMagnetSearchModel> ret = new List <SeedMagnetSearchModel>();

    try
    {
        var serachContent = "https://sukebei.nyaa.si?f=0&c=0_0&q=" + id;
        var htmlRet = HtmlManager.GetHtmlWebClient("https://sukebei.nyaa.si", serachContent, cc);
        if (htmlRet.Success)
        {
            HtmlDocument htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(htmlRet.Content);

            HtmlNodeCollection nodes = htmlDocument.DocumentNode.SelectNodes("//tr");

            // BUG FIX: SelectNodes returns null when nothing matches; the
            // original dereferenced it and relied on the catch-all below.
            if (nodes != null)
            {
                // Skip the header row.
                foreach (var node in nodes.Skip(1))
                {
                    var text = FileUtility.ReplaceInvalidChar(node.ChildNodes[3].InnerText.Trim());
                    var a = node.ChildNodes[5].OuterHtml;
                    var size = node.ChildNodes[7].InnerText.Trim();
                    // The timestamp cell carries a unix epoch in a data attribute;
                    // strip the surrounding markup to isolate it.
                    var date = node.ChildNodes[9].OuterHtml.Trim().Replace("<td class=\"text-center\" data-timestamp=\"", "").Replace("\"></td>", "");

                    // Extract the magnet href from the raw anchor markup.
                    var url = a.Substring(a.IndexOf("<a href=\"magnet:?xt") + 9);
                    url = url.Substring(0, url.IndexOf("\""));

                    int seconds = 0;
                    int.TryParse(date, out seconds);

                    // Unix epoch seconds -> local time.
                    DateTime startTime = TimeZone.CurrentTimeZone.ToLocalTime(new System.DateTime(1970, 1, 1));
                    DateTime dt = startTime.AddSeconds(seconds);

                    ret.Add(new SeedMagnetSearchModel
                    {
                        Title = text,
                        Size = FileUtility.GetFileSizeFromString(size),
                        Date = dt,
                        Url = "",
                        MagUrl = url,
                        Source = SearchSeedSiteEnum.Sukebei
                    });
                }
            }
        }
    }
    catch (Exception)
    {
        // Best effort: return whatever was collected before the failure.
    }

    return ret.Where(x => x.Size >= 0).OrderByDescending(x => x.CompleteCount).ThenByDescending(x => x.Size).ToList();
}
/// <summary>
/// Scans javlibrary category listing pages (single-threaded) and queues every
/// AV detail URL that has not been scanned yet.
/// </summary>
/// <param name="urls">Map of listing-page URL to category name.</param>
private static void ScanCategoryPageUrlSingleThread(Dictionary <string, string> urls)
{
    int index = 1;
    foreach (var url in urls)
    {
        int retry = 1;
        var htmlRes = new Utils.HtmlResponse();
        // Retry up to 5 times when the cookie has expired, renewing it each time.
        while (retry <= 5)
        {
            htmlRes = HtmlManager.GetHtmlWebClientWithRenewCC("http://www.javlibrary.com/cn/", url.Key, cc);
            if (htmlRes.IsExpire)
            {
                GetJavCookie(); // refresh the shared cookie, then retry
                retry++;
                continue;
            }
            else
            {
                break;
            }
        }
        if (htmlRes.Success)
        {
            HtmlAgilityPack.HtmlDocument htmlDocument = new HtmlAgilityPack.HtmlDocument();
            htmlDocument.LoadHtml(htmlRes.Content);
            var videoPath = "//div[@class='video']";
            var videoNodes = htmlDocument.DocumentNode.SelectNodes(videoPath);
            if (videoNodes != null)
            {
                int unScanCount = 0;
                foreach (var node in videoNodes)
                {
                    // First child carries both the detail link and the id/title text.
                    var urlAndTitle = node.ChildNodes[0];
                    if (urlAndTitle != null && urlAndTitle.ChildNodes.Count >= 3)
                    {
                        var id = urlAndTitle.ChildNodes[0].InnerText.Trim();
                        var name = FileUtility.ReplaceInvalidChar(urlAndTitle.ChildNodes[2].InnerText.Trim());
                        // Relative "./" links are rebased onto the /cn/ site root.
                        var avUrl = urlAndTitle.Attributes["href"].Value.Trim().Replace("./", "http://www.javlibrary.com/cn/");
                        if (!string.IsNullOrEmpty(avUrl) && !string.IsNullOrEmpty(name) && !string.IsNullOrWhiteSpace(id))
                        {
                            ScanURL scan = new ScanURL { Category = url.Value, ID = id, IsDownload = false, Title = name, URL = avUrl };
                            // Only queue URLs we have not recorded before.
                            if (!JavDataBaseManager.HasScan(scan))
                            {
                                unScanCount++;
                                JavDataBaseManager.InsertScanURL(scan);
                            }
                        }
                    }
                }
                Console.WriteLine(url.Value + " 第 " + index + " / " + urls.Count + " 页, 加入" + unScanCount + " 条未扫描AV");
                index++;
            }
        }
        else
        {
            Console.WriteLine("获取列表页 " + url.Key + " 内容失败");
        }
    }
}
/// <summary>
/// Scrapes a javbus.com detail page into an <c>AV</c> record via regex matching
/// against the Traditional-Chinese field labels on the page.
/// </summary>
/// <param name="url">Absolute URL of the detail page.</param>
/// <param name="cc">Cookie container forwarded to the HTTP helper.</param>
/// <param name="mapping">Unused here; kept for interface compatibility with callers.</param>
/// <returns>The populated record, or <c>null</c> when the page fetch fails.</returns>
public static AV GetJavBusSearchDetail(string url, CookieContainer cc, Dictionary <string, string> mapping)
{
    AV av = new AV();
    var listHtml = HtmlManager.GetHtmlContentViaUrl(url, "utf-8", false, cc);
    if (listHtml.Success)
    {
        // One regex per field; group numbers below refer to these patterns.
        var titleTemplate = "<h3>(.*?)</h3>";
        var imgTemplate = "<a class=\"bigImage\" href=\"(.*?)\">";
        var idTemplate = "<span style=\"color:#CC0000;\">(.*?)</span>";
        var dateTemplate = "<p><span class=\"header\">發行日期:</span>(.*?)</p>";
        var directorTemplate = "<p><span class=\"header\">導演:</span> <a href=\"(.*?)\">(.*?)</a></p>";
        var lengthTemplate = "<p><span class=\"header\">長度:</span>(.*?)分鐘</p>";
        var actressTemplate = "<div class=\"star-name\"><a href=\"(.*?)\" title=\"(.*?)\">(.*?)</a></div>";
        var companyTemplate = "<p><span class=\"header\">製作商:</span> <a href=\"(.*?)\">(.*?)</a>";
        var publisherTemplate = "<p><span class=\"header\">發行商:</span> <a href=\"(.*?)\">(.*?)</a>";
        var categotyTemplate = "<span class=\"genre\"><a href=\"(.*?)\">(.*?)</a></span>";
        var mTitle = Regex.Match(listHtml.Content, titleTemplate);
        var mId = Regex.Match(listHtml.Content, idTemplate);
        var mImg = Regex.Match(listHtml.Content, imgTemplate);
        var mDate = Regex.Match(listHtml.Content, dateTemplate);
        var mLength = Regex.Match(listHtml.Content, lengthTemplate);
        var mDirector = Regex.Matches(listHtml.Content, directorTemplate);
        var mActress = Regex.Matches(listHtml.Content, actressTemplate);
        var mCompany = Regex.Matches(listHtml.Content, companyTemplate);
        var mPublisher = Regex.Matches(listHtml.Content, publisherTemplate);
        var mCategory = Regex.Matches(listHtml.Content, categotyTemplate);
        var id = mId.Groups[1];
        // The <h3> title embeds the id; strip it to keep only the name.
        var title = mTitle.Groups[1].ToString().Replace(id.ToString(), "").Trim();
        var img = mImg.Groups[1];
        var date = mDate.Groups[1];
        var length = mLength.Groups[1];
        var director = "";
        var actress = "";
        var company = "";
        var publisher = "";
        var category = "";
        // Multi-valued fields are joined as comma-terminated lists ("a,b,")
        // — callers apparently rely on this shape, so it is preserved.
        foreach (System.Text.RegularExpressions.Match d in mDirector) { director += d.Groups[2] + ","; }
        foreach (System.Text.RegularExpressions.Match d in mActress) { var act = d.Groups[3].ToString(); actress += act + ","; }
        foreach (System.Text.RegularExpressions.Match d in mCompany) { company += d.Groups[2] + ","; }
        foreach (System.Text.RegularExpressions.Match d in mPublisher) { publisher += d.Groups[2] + ","; }
        foreach (System.Text.RegularExpressions.Match d in mCategory) { category += d.Groups[2] + ","; }
        av.Name = title;
        av.ID = id.ToString();
        av.PictureURL = img.ToString();
        av.Publisher = publisher;
        // Fallback sentinel 2050-01-01 for a missing/unparseable release date.
        // TryParse resets its out argument to default on failure, so the sentinel
        // must be applied AFTER the call — the original assigned it before and lost it.
        DateTime parse;
        if (!DateTime.TryParse(date.ToString(), out parse))
        {
            parse = new DateTime(2050, 1, 1);
        }
        av.ReleaseDate = parse;
        av.Director = director;
        av.Actress = actress;
        av.Company = company;
        av.Category = category;
        // 長度 may be absent (empty group): TryParse avoids the FormatException
        // that int.Parse threw here, defaulting the length to 0.
        int avLength;
        int.TryParse(length.ToString(), out avLength);
        av.AvLength = avLength;
        return(av);
    }
    return(null);
}
/// <summary>
/// Fetches one page of the hanhande manga category listing and maps it into a
/// <c>MangaCategoryListVM</c>. Failures are reported via <c>MsgCode</c>/<c>Msg</c>.
/// </summary>
/// <param name="category">Path segments joined with '-'; a single empty entry means "all".</param>
/// <param name="page">1-based page number.</param>
private static MangaCategoryListVM GetManageCategoryListHanhan(string[] category, int page)
{
    MangaCategoryListVM ret = new MangaCategoryListVM();
    ret.Mangas = new List <MangaCategoryListItem>();
    ret.CurrentPage = page;
    var urlPrefix = "http://www.hanhande.net/";
    var url = "list";
    var searchUrl = "";
    if (category.Length == 1 && string.IsNullOrEmpty(category[0]))
    {
        url += "_" + page + "/"; // no category filter: list_<page>/
    }
    else
    {
        url += "/" + string.Join("-", category) + "/" + page + "/";
    }
    searchUrl = urlPrefix + url;
    // NOTE(review): first argument host "hanhan.net" differs from urlPrefix
    // "hanhande.net" — looks intentional (referer?), confirm before changing.
    var htmlRet = HtmlManager.GetHtmlWebClient("http://www.hanhan.net", searchUrl);
    if (htmlRet.Success)
    {
        try
        {
            HtmlDocument document = new HtmlDocument();
            document.LoadHtml(htmlRet.Content);
            var listPath = "//li[@class='item-lg']";
            var pagePath = "//li[@class='last']";
            var listNodes = document.DocumentNode.SelectNodes(listPath);
            var pageNode = document.DocumentNode.SelectSingleNode(pagePath);
            // Total page count comes from the "last" pager link's data-page
            // attribute (0-based, hence the +1).
            if (pageNode != null && pageNode.ChildNodes.FindFirst("a") != null)
            {
                int total = -1;
                int.TryParse(pageNode.ChildNodes.FindFirst("a").Attributes["data-page"]?.Value.Trim(), out total);
                ret.TotalPage = total + 1;
            }
            if (listNodes != null)
            {
                foreach (var node in listNodes)
                {
                    var aTag = node.ChildNodes.FindFirst("a");
                    if (aTag == null)
                    {
                        continue;
                    }
                    MangaCategoryListItem temp = new MangaCategoryListItem();
                    // Missing attributes are tolerated with '?.' — the original
                    // dereferenced them unchecked, so one malformed node aborted
                    // the entire page through the outer catch.
                    temp.MangaUrl = aTag.Attributes["href"]?.Value.Trim();
                    temp.MangaName = aTag.Attributes["title"]?.Value.Trim();
                    foreach (var subNode in aTag.ChildNodes)
                    {
                        var cls = subNode.Attributes["class"]?.Value.Trim();
                        if (subNode.Name == "img")
                        {
                            temp.PicUrl = subNode.Attributes["src"]?.Value.Trim();
                        }
                        if (subNode.Name == "span" && cls == "fd")
                        {
                            temp.IsFinished = true; // "fd" badge marks a finished series
                        }
                        if (subNode.Name == "span" && cls == "tt")
                        {
                            temp.UpdateInfo = subNode.InnerHtml.Trim();
                        }
                    }
                    foreach (var subNode in node.ChildNodes)
                    {
                        var cls = subNode.Attributes["class"]?.Value.Trim();
                        if (subNode.Name == "span" && cls == "updateon")
                        {
                            // Parse the text after the first ':' (e.g. "更新于:2020-01-01").
                            // The original passed an ABSOLUTE index as the Substring
                            // *length* argument, mangling the date or throwing
                            // ArgumentOutOfRangeException. Fall back to now when the
                            // date is absent or unparseable (TryParse would otherwise
                            // leave the default(DateTime) it writes on failure).
                            DateTime updateDate;
                            var html = subNode.InnerHtml;
                            var colon = html.IndexOf(":");
                            if (colon < 0 || !DateTime.TryParse(html.Substring(colon + 1).Trim(), out updateDate))
                            {
                                updateDate = DateTime.Now;
                            }
                            temp.UpdateDate = updateDate;
                        }
                    }
                    ret.Mangas.Add(temp);
                }
            }
        }
        catch (Exception e)
        {
            ret.MsgCode = VMCode.Exception;
            ret.Msg = e.ToString();
        }
    }
    else
    {
        ret.MsgCode = VMCode.Error;
        ret.Msg = "网页获取失败";
    }
    ret.PageSize = ret.Mangas.Count;
    return(ret);
}