/// <summary>
/// Optionally performs the work belonging to <paramref name="step"/> and then makes the
/// matching tab of <c>UrlsToAddTabControl</c> the only visible (and selected) one.
/// </summary>
/// <param name="step">
/// Index of the tab to show. Values below zero show the first tab, values at or beyond the
/// number of tabs show the last one.
/// </param>
/// <param name="action">
/// When <c>true</c>, step 1 collects the images of every entry in <c>Urls</c> on a background
/// worker and step 2 builds a parse rule from the selected images. When <c>false</c> only the
/// visible tab is changed.
/// </param>
/// <param name="clear">When <c>true</c>, <c>UrlsToAddList</c> is emptied first.</param>
/// <param name="insideThreadCount">
/// Degree of parallelism used across URLs in step 1. The value passed in is overwritten inside
/// step 1 (1 in DEBUG builds, <c>ThreadCount</c> otherwise), so the argument itself is not used
/// by the code in this method.
/// </param>
private void UpdateStep(int step, bool action, bool clear = false, byte insideThreadCount = 3)
{
    if (clear)
        UrlsToAddList.Clear();

    #region step 1
    if (step == 1 && action)
    {
        object lockAdd = new Object();
        UrlsToAddList.Clear();

#if DEBUG
        insideThreadCount = 1;
#else
        insideThreadCount = ThreadCount;
#endif

        // Snapshot the rule settings up front so the worker thread does not read NewParseRule.
        ParseRuleConnectionType type = NewParseRule.Connection;
        int minWidth = NewParseRule.MinImageWidth;
        int minHeight = NewParseRule.MinImageHeight;
        bool collectIMGTags = NewParseRule.CollectIMGTags;
        bool collectLINKTags = NewParseRule.CollectLINKTags;
        bool collectMETATags = NewParseRule.CollectMETATags;
        byte threadCount = this.ThreadCount;

        BackgroundWorker bw = new BackgroundWorker();
        bw.WorkerReportsProgress = true;

        bw.DoWork += (s, e) =>
        {
            Helpers.PercentageProgress progress = new Helpers.PercentageProgress();
            progress.Change += (sP, eP) => { bw.ReportProgress((int)eP.Value); };

            List<UrlResultWrapper> urlResultWrapper = new List<UrlResultWrapper>();
            System.Drawing.Size minSize = new System.Drawing.Size(minWidth, minHeight);

            var urls = e.Argument as StringUrlWithResultWrapper[];
            if (urls != null)
            {
                urls
                    .Where(item => item != null && !string.IsNullOrWhiteSpace(item.Value) && Helper.IsWellFormedUriString(item.Value, UriKind.Absolute))
                    // One progress child per URL, created before the parallel section starts.
                    .Select(sw => new { item = sw, prgItem = progress.GetChild() })
                    .ToArray()
                    .AsParallel()
                    .WithDegreeOfParallelism(insideThreadCount)
                    .ForAll(sw =>
                    {
                        var item = new UrlResultWrapper() { Value = sw.item.Value };
                        var result = Helper.GetAllImagesFromUrl(item.Value, minSize, collectIMGTags, collectLINKTags, collectMETATags, threadCount, sw.prgItem, true, type);
                        foreach (ParseImageResult res in result)
                            item.ParseResult.Add(res);

                        if (item.ParseResult.Count > 0)
                        {
                            // The shared list is filled from several PLINQ workers.
                            lock (lockAdd)
                            {
                                urlResultWrapper.Add(item);
                            }
                        }
                    });
            }

            e.Result = urlResultWrapper;
        };

        bw.RunWorkerCompleted += (s, e) =>
        {
            try
            {
                // Checked before touching e.Result, and inside the try so that a failed run
                // still resets IsBusy and disposes the worker in the finally block.
                if (e.Error != null)
                    throw e.Error;

                List<UrlResultWrapper> urlResultWrapper = e.Result as List<UrlResultWrapper>;
                if (urlResultWrapper != null)
                {
                    foreach (var item in urlResultWrapper)
                    {
                        if (item.ParseResult != null)
                        {
                            // Pre-select the first image of every URL, deselect the rest.
                            bool isFirst = true;
                            foreach (var ps in item.ParseResult)
                            {
                                ps.IsSelected = isFirst;
                                isFirst = false;
                            }
                        }
                        UrlsToAddList.Add(item);
                    }
                }
            }
            finally
            {
                bw.Dispose();
                IsBusy = false;
            }
        };

        bw.ProgressChanged += (s, e) => { LoadedPercent = e.ProgressPercentage; };

        IsBusy = true;
        bw.RunWorkerAsync(Urls.ToArray());

        // Stay in this method until the worker has finished, calling Helper.DoEvents() meanwhile.
        // NOTE(review): presumably DoEvents pumps the UI message queue so progress/completion
        // callbacks can run - confirm against Helper.DoEvents.
        while (bw.IsBusy)
            Helper.DoEvents();
        bw = null;
    }
    #endregion
    #region step 2
    else if (step == 2 && action)
    {
        // Only entries whose selected image carries a node contribute to the rule.
        HtmlNodeWithUrl[] nodes = UrlsToAddList
            .Where(n => !string.IsNullOrWhiteSpace(n.Value))
            .Select(n => n.ParseResult.FirstOrDefault(r => r.IsSelected))
            .Where(selected => selected != null && selected.Node != null)
            .Select(selected => new HtmlNodeWithUrl() { Node = selected.Node, Url = selected.Url })
            .ToArray();

        ParseRule newRule = Helper.GetRule(nodes, NewParseRule.Label, NewParseRule.MinImageSize, NewParseRule.CollectIMGTags, NewParseRule.CollectLINKTags, NewParseRule.CollectMETATags);
        newRule.CopyObject(NewParseRule, new string[] { "Connection" });
        ShowRuleModeCommand.Execute(null);
    }
    #endregion

    // Clamp the requested step to an existing tab and show only that tab.
    int tabCount = UrlsToAddTabControl.Items.Count;
    int visibleIndex = step >= tabCount
        ? tabCount - 1
        : (step < 0 ? 0 : step);

    for (int i = tabCount - 1; i >= 0; i--)
    {
        ((TabItem)UrlsToAddTabControl.Items[i]).Visibility = (i == visibleIndex)
            ? System.Windows.Visibility.Visible
            : System.Windows.Visibility.Collapsed;
    }

    UrlsToAddTabControl.SelectedIndex = UrlsToAddTabControl.Items.IndexOf(
        UrlsToAddTabControl.Items.Cast<TabItem>().FirstOrDefault(ti => ti.Visibility == System.Windows.Visibility.Visible));
}
/// <summary>
/// Downloads the resource at <paramref name="fileUrl"/> into the local file
/// <paramref name="tempFileName"/>, sending the header line <c>User-Agent: Other</c>.
/// </summary>
/// <param name="type">Connection type of the rule; not used by this method body.</param>
/// <param name="fileUrl">Address of the resource to download.</param>
/// <param name="tempFileName">Path of the local file that receives the data.</param>
internal static void GetFile(ParseRuleConnectionType type, Uri fileUrl, string tempFileName)
{
    WebClient client = new WebClient();
    try
    {
        client.Headers.Add("User-Agent: Other");
        client.DownloadFile(fileUrl, tempFileName);
    }
    finally
    {
        client.Dispose();
    }
}
/// <summary>
/// Loads the page at <paramref name="url"/>, collects its image links and returns one
/// <see cref="ParseImageResult"/> per distinct image URL that passes the size checks,
/// ordered by pixel area (largest first) and then by URL.
/// </summary>
/// <param name="url">Address of the page to inspect.</param>
/// <param name="minSize">Minimum image size handed to <c>Helper.CheckImageSize</c>.</param>
/// <param name="collectIMGTags">Forwarded to <c>GetAllImagesUrlsFromUrl</c>.</param>
/// <param name="collectLINKTags">Forwarded to <c>GetAllImagesUrlsFromUrl</c>.</param>
/// <param name="collectMETATags">Forwarded to <c>GetAllImagesUrlsFromUrl</c>.</param>
/// <param name="threadCount">Degree of parallelism used while processing the image links.</param>
/// <param name="prgItem">
/// Optional progress node. Two children are created from it: one is set to 100 once the page
/// is loaded, the other tracks the share of processed image links.
/// </param>
/// <param name="downloadImages">
/// When <c>true</c> every candidate is downloaded to a temp file (RIFF containers are converted
/// to JPEG through the WebP decoder), loaded as an <see cref="System.Drawing.Image"/> and
/// re-checked against <paramref name="minSize"/> using its real dimensions.
/// </param>
/// <param name="type">Connection type used to fetch the page and the image files.</param>
/// <returns>The accepted images; empty when nothing qualified or loading the page failed.</returns>
/// <remarks>
/// Exceptions are written to <c>Helpers.Old.Log</c> instead of being propagated to the caller.
/// </remarks>
public static ParseImageResult[] GetAllImagesFromUrl(
    string url,
    System.Drawing.Size minSize,
    bool collectIMGTags,
    bool collectLINKTags,
    bool collectMETATags,
    int threadCount = 6,
    Helpers.PercentageProgress prgItem = null,
    bool downloadImages = false,
    ParseRuleConnectionType type = ParseRuleConnectionType.Direct)
{
    List<ParseImageResult> result = new List<ParseImageResult>();
    try
    {
        Helpers.PercentageProgress prgItemPage = null;
        Helpers.PercentageProgress prgItemImg = null;
        if (prgItem != null)
        {
            prgItemPage = prgItem.GetChild();
            prgItemImg = prgItem.GetChild();
        }

        HtmlAgilityPack.HtmlDocument document = SiteManager.GetContent(url, type, out url);
        if (prgItemPage != null)
            prgItemPage.Value = 100;

        // Materialised once: the sequence is counted and then iterated in parallel.
        var allLinks = GetAllImagesUrlsFromUrl(document, url, collectIMGTags, collectLINKTags, collectMETATags, null).ToArray();
        int fullCnt = allLinks.Length;
        int currLoaded = 0;
        object currLoadedLock = new Object();

        // Guards 'result' and 'addedUrls', which are shared by all PLINQ workers.
        object lockAdd = new Object();
        HashSet<string> addedUrls = new HashSet<string>();

        allLinks
            .AsParallel()
            .WithDegreeOfParallelism(threadCount)
            .ForAll(node =>
            {
                Uri fileUrl = node.Url;
                try
                {
                    System.Drawing.Size imageSize;
                    if (!Helper.CheckImageSize(fileUrl.AbsoluteUri, minSize, out imageSize, true, !downloadImages))
                        return;

                    if (!imageSize.IsEmpty)
                        SetImageSize(node.Node, imageSize);

                    string tempFileName = null;
                    System.Drawing.Image image = null;
                    bool added = false;
                    try
                    {
                        if (downloadImages)
                        {
                            tempFileName = DownloadImageToTempFile(type, fileUrl);
                            image = System.Drawing.Image.FromFile(tempFileName);

                            // Prefer the dimensions of the file that was actually downloaded.
                            imageSize = new System.Drawing.Size(image.Width, image.Height);
                            if (!imageSize.IsEmpty)
                                SetImageSize(node.Node, imageSize);
                        }

                        if (!downloadImages || Helper.CheckImageSize(imageSize, minSize, false))
                        {
                            string key = fileUrl.AbsoluteUri;
                            lock (lockAdd)
                            {
                                // Keep one entry per URL; later duplicates are dropped here
                                // so their images get disposed in the finally block below.
                                if (!addedUrls.Contains(key))
                                {
                                    result.Add(new ParseImageResult(node.Node, image, imageSize, fileUrl));
                                    addedUrls.Add(key);
                                    added = true;
                                }
                            }
                        }
                    }
                    finally
                    {
                        // The image is owned by the result entry only when it was added.
                        if (!added && image != null)
                            image.Dispose();
                        if (tempFileName != null)
                            DeleteTempFileQuietly(tempFileName);
                    }
                }
                catch (Exception ex)
                {
                    Helpers.Old.Log.Add(ex, string.Format("Helper.GetAllImagesFromUrl(url:'{0}',..,type:'{1}').ForAllThread(fileUrl:'{2}',..)", url, type, fileUrl.AbsoluteUri));
                }
                finally
                {
                    if (prgItemImg != null)
                    {
                        lock (currLoadedLock)
                        {
                            currLoaded++;
                            prgItemImg.Value = ((decimal)currLoaded / (decimal)fullCnt) * 100m;
                        }
                    }
                }
            });

        if (prgItemImg != null)
            prgItemImg.Value = 100;
    }
    catch (Exception ex)
    {
        Helpers.Old.Log.Add(ex, string.Format("Helper.GetAllImagesFromUrl(url:'{0}',..,type:'{1}')", url, type));
    }

    if (prgItem != null && prgItem.Value != 100)
        prgItem.Value = 100;

    // Area is computed as long so very large images cannot overflow int.
    return result
        .OrderByDescending(i => (long)i.ImageSize.Width * i.ImageSize.Height)
        .ThenBy(i => i.Url.ToString())
        .ToArray();
}

/// <summary>
/// Downloads <paramref name="fileUrl"/> to a new temp file and returns the path of the file that
/// should be loaded as an image. A file starting with the RIFF signature is decoded with the
/// WebP decoder and re-saved as JPEG; in that case the JPEG path is returned and the raw
/// download is removed.
/// </summary>
private static string DownloadImageToTempFile(ParseRuleConnectionType type, Uri fileUrl)
{
    string fileName = fileUrl.AbsolutePath.Split(new[] { '/' }).Last();
    string tempFileName = System.IO.Path.GetTempPath() + Guid.NewGuid().ToString() + System.IO.Path.GetExtension(fileName);
    try
    {
        SiteManager.GetFile(type, fileUrl, tempFileName);
        if (!HasRiffSignature(tempFileName))
            return tempFileName;

        Imazen.WebP.Extern.LoadLibrary.LoadWebPOrFail();
        var decoder = new SimpleDecoder();
        string outFile = tempFileName + ".jpg";
        try
        {
            byte[] bytes;
            using (Stream inputStream = System.IO.File.Open(tempFileName, System.IO.FileMode.Open))
            {
                bytes = ReadFully(inputStream);
            }

            using (var outBitmap = decoder.DecodeFromBytes(bytes, bytes.LongLength))
            using (FileStream outStream = new FileStream(outFile, FileMode.Create))
            {
                outBitmap.Save(outStream, System.Drawing.Imaging.ImageFormat.Jpeg);
            }
        }
        catch
        {
            // Do not leave a partially written JPEG behind.
            DeleteTempFileQuietly(outFile);
            throw;
        }

        // The raw download is no longer needed once the JPEG exists.
        DeleteTempFileQuietly(tempFileName);
        return outFile;
    }
    catch
    {
        DeleteTempFileQuietly(tempFileName);
        throw;
    }
}

/// <summary>Returns whether the first four bytes of the file spell "RIFF" (case-insensitive).</summary>
private static bool HasRiffSignature(string path)
{
    using (Stream inputStream = System.IO.File.Open(path, System.IO.FileMode.Open))
    {
        var header = new byte[4];
        int read = inputStream.Read(header, 0, header.Length);
        return read > 0
            && string.Equals(Encoding.ASCII.GetString(header, 0, read), "RIFF", StringComparison.OrdinalIgnoreCase);
    }
}

/// <summary>Deletes a temp file, ignoring any failure.</summary>
private static void DeleteTempFileQuietly(string path)
{
    try
    {
        System.IO.File.Delete(path);
    }
    catch (Exception)
    {
        // Deliberately best-effort: Image.FromFile keeps its file locked until the image is
        // disposed, so the file behind an image that was handed to the caller cannot be
        // removed at this point.
    }
}
/// <summary>
/// Loads the HTML document at <paramref name="url"/> using the transport selected by
/// <paramref name="type"/>.
/// </summary>
/// <param name="url">Address of the page to load.</param>
/// <param name="type">
/// <c>Direct</c> loads through <c>HtmlWeb</c>; the <c>IE_*</c> and <c>CHR_*</c> values navigate
/// with <c>SiteManagerIE</c> / <c>SiteManagerCHR</c>, passing the number embedded in the enum
/// member name as the wait value.
/// </param>
/// <param name="urlResponse">
/// Receives the response URI reported by the transport, or <paramref name="url"/> unchanged
/// when no document was loaded.
/// </param>
/// <returns>
/// The loaded document, or <c>null</c> when <paramref name="type"/> is not handled or its wait
/// value cannot be parsed.
/// </returns>
/// <exception cref="InvalidOperationException"><c>IsInited</c> is <c>false</c>.</exception>
public static HtmlAgilityPack.HtmlDocument GetContent(string url, ParseRuleConnectionType type, out string urlResponse)
{
    urlResponse = url;
    if (!IsInited)
        throw new InvalidOperationException("SiteManager not inited. Use SiteManager.Init() to initialize components");

    HtmlAgilityPack.HtmlDocument document = null;
    switch (type)
    {
        case ParseRuleConnectionType.Direct:
            {
                HtmlWeb htmlWeb = new HtmlWeb() { AutoDetectEncoding = true, UserAgent = "Other" };
                document = htmlWeb.Load(url);
                if (document.StreamEncoding != document.Encoding)
                {
                    // The encoding detected in the document differs from the one used to read
                    // the stream: load again with the detected encoding forced.
                    htmlWeb.AutoDetectEncoding = false;
                    htmlWeb.OverrideEncoding = document.Encoding;
                    document = htmlWeb.Load(url);
                }
                urlResponse = htmlWeb.ResponseUri.AbsoluteUri;
                break;
            }

        case ParseRuleConnectionType.IE_00_sec:
        case ParseRuleConnectionType.IE_05_sec:
        case ParseRuleConnectionType.IE_10_sec:
            {
                int wait;
                if (TryGetWaitSeconds(type, out wait))
                {
                    // NOTE(review): this branch passes the raw number while the CHR branch
                    // passes wait * 1000 - confirm the unit each Navigate overload expects.
                    SiteManagerIE mgr = new SiteManagerIE();
                    var res = mgr.Navigate(new Uri(url), wait);
                    document = new HtmlAgilityPack.HtmlDocument();
                    // Same null guard as the CHR branch: LoadHtml is not given a null string.
                    document.LoadHtml(res.Content ?? string.Empty);
                    urlResponse = res.ResponseUri.AbsoluteUri;
                }
                break;
            }

        case ParseRuleConnectionType.CHR_00_sec:
        case ParseRuleConnectionType.CHR_05_sec:
        case ParseRuleConnectionType.CHR_10_sec:
            {
                int wait;
                if (TryGetWaitSeconds(type, out wait))
                {
                    SiteManagerCHR mgr = new SiteManagerCHR();
                    var res = mgr.Navigate(new Uri(url), wait * 1000);
                    document = new HtmlAgilityPack.HtmlDocument();
                    document.LoadHtml(res.Content ?? string.Empty);
                    urlResponse = res.ResponseUri.AbsoluteUri;
                }
                break;
            }
    }

    return document;
}

/// <summary>
/// Extracts the number between the first and the last underscore of the enum member name,
/// e.g. <c>IE_05_sec</c> yields 5.
/// </summary>
private static bool TryGetWaitSeconds(ParseRuleConnectionType type, out int seconds)
{
    seconds = 0;
    string name = Enum.GetName(typeof(ParseRuleConnectionType), type);
    if (name == null)
        return false;

    int first = name.IndexOf('_');
    int last = name.LastIndexOf('_');
    if (first < 0 || last <= first)
        return false;

    return int.TryParse(name.Substring(first + 1, last - first - 1), out seconds);
}