Esempio n. 1
0
        /// <summary>
        /// Optionally performs the work attached to a wizard step and then shows the tab that corresponds to <paramref name="step"/>.
        /// Step 1 collects images for every entry in <c>Urls</c> on a background worker and fills <c>UrlsToAddList</c>;
        /// step 2 builds a parse rule from the selected images and copies it into <c>NewParseRule</c>.
        /// </summary>
        /// <param name="step">Index of the tab to show; values outside the tab range are clamped to the first/last tab.</param>
        /// <param name="action">When true, the work associated with step 1 or step 2 is executed before switching tabs.</param>
        /// <param name="clear">When true, <c>UrlsToAddList</c> is cleared first.</param>
        /// <param name="insideThreadCount">Ignored on entry to step 1: it is overwritten with 1 in DEBUG builds and with <c>ThreadCount</c> otherwise.</param>
        private void UpdateStep(int step, bool action, bool clear = false, byte insideThreadCount = 3)
        {
            if (clear)
                UrlsToAddList.Clear();

            #region step 1
            if (step == 1 && action)
            {
                object lockAdd = new Object();
                UrlsToAddList.Clear();

#if DEBUG
                insideThreadCount = 1;
#else
                insideThreadCount = ThreadCount;
#endif
                // WithDegreeOfParallelism rejects values below 1, so never pass 0 through.
                if (insideThreadCount < 1)
                    insideThreadCount = 1;

                // Snapshot the rule settings so the worker thread does not read NewParseRule directly.
                ParseRuleConnectionType type = NewParseRule.Connection;

                int minWidth = NewParseRule.MinImageWidth;
                int minHeight = NewParseRule.MinImageHeight;
                bool collectIMGTags = NewParseRule.CollectIMGTags;
                bool collectLINKTags = NewParseRule.CollectLINKTags;
                bool collectMETATags = NewParseRule.CollectMETATags;
                byte threadCount = this.ThreadCount;

                BackgroundWorker bw = new BackgroundWorker();
                bw.DoWork += (s, e) =>
                    {
                        Helpers.PercentageProgress progress = new Helpers.PercentageProgress();
                        progress.Change += (sP, eP) =>
                            {
                                bw.ReportProgress((int)eP.Value);
                            };

                        List<UrlResultWrapper> urlResultWrapper = new List<UrlResultWrapper>();
                        var urls = e.Argument as StringUrlWithResultWrapper[];

                        if (urls != null)
                            urls
                                .Where(item => item != null && !string.IsNullOrWhiteSpace(item.Value) && Helper.IsWellFormedUriString(item.Value, UriKind.Absolute))
                                .Select(sw => 
                                    new 
                                        { 
                                            item = sw,
                                            prgItem = progress.GetChild() 
                                        })
                                // Materialize before going parallel so every progress child is created on this thread.
                                .ToArray()
                                .AsParallel()
                                .WithDegreeOfParallelism(insideThreadCount)
                                .ForAll(
                                    (sw) =>
                                    {
                                        var item = new UrlResultWrapper() { Value = sw.item.Value };

                                        System.Drawing.Size minSize = new System.Drawing.Size() { Width = minWidth, Height = minHeight };

                                        var result = Helper.GetAllImagesFromUrl(item.Value, minSize, collectIMGTags, collectLINKTags, collectMETATags, threadCount, sw.prgItem, true, type);

                                        foreach (ParseImageResult res in result)
                                            item.ParseResult.Add(res);

                                        // The shared list is written from several PLINQ workers, hence the lock.
                                        if (item.ParseResult.Count > 0)
                                            lock (lockAdd)
                                            {
                                                urlResultWrapper.Add(item);
                                            }
                                    });

                        e.Result = urlResultWrapper;
                    };
                bw.RunWorkerCompleted += (s, e) =>
                    {
                        try
                        {
                            // Checked inside the try so that the finally block below still disposes the
                            // worker and resets IsBusy when the background work failed.
                            if (e.Error != null)
                                throw e.Error;

                            List<UrlResultWrapper> urlResultWrapper = e.Result as List<UrlResultWrapper>;
                            if (urlResultWrapper != null)
                                foreach (var item in urlResultWrapper)
                                {
                                    // Pre-select the first image of every URL.
                                    if (item.ParseResult != null)
                                        foreach (var ps in item.ParseResult)
                                            ps.IsSelected = (item.ParseResult.IndexOf(ps) == 0);

                                    UrlsToAddList.Add(item);
                                }
                        }
                        finally
                        {
                            bw.Dispose();
                            IsBusy = false;
                        }
                    };
                bw.WorkerReportsProgress = true;
                bw.ProgressChanged += (s, e) =>
                    {
                        LoadedPercent = e.ProgressPercentage;
                    };
                IsBusy = true;
                bw.RunWorkerAsync(Urls.ToArray());
                // Block this method until the worker is done while still pumping events via Helper.DoEvents().
                while (bw.IsBusy)
                    Helper.DoEvents();
                bw = null;
            }
#endregion
#region step 2
            else if (step == 2 && action)
            {
                HtmlNodeWithUrl[] nodes =
                    UrlsToAddList
                    .Where(n => !string.IsNullOrWhiteSpace(n.Value))
                    .Select(i => 
                        {
                            // Entries without a ParseResult collection are treated like entries without a selection.
                            ParseImageResult res = i.ParseResult == null ? null : i.ParseResult.FirstOrDefault(i2 => i2.IsSelected);
                            return
                                new HtmlNodeWithUrl()
                                {
                                    Node = res == null ? null : res.Node,
                                    Url = res == null ? new Uri(i.Value, UriKind.RelativeOrAbsolute) : res.Url
                                };
                        }
                    )
                    // Only entries that actually have a selected node take part in rule generation.
                    .Where(i3 => i3 != null && i3.Node != null)
                    .ToArray();

                ParseRule newRule = Helper.GetRule(nodes, NewParseRule.Label, NewParseRule.MinImageSize, NewParseRule.CollectIMGTags, NewParseRule.CollectLINKTags, NewParseRule.CollectMETATags);
                newRule.CopyObject(NewParseRule, new string[] { "Connection" });

                ShowRuleModeCommand.Execute(null);
            }
#endregion

            // Exactly one tab stays visible: the one matching the step, clamped to the last tab
            // when the step is too large and to the first tab when it is negative.
            int tabCount = UrlsToAddTabControl.Items.Count;
            int visibleIndex = step >= tabCount ? tabCount - 1 : (step < 0 ? 0 : step);

            for (int i = tabCount - 1; i >= 0; i--)
                (UrlsToAddTabControl.Items[i] as TabItem).Visibility = (i == visibleIndex) ? System.Windows.Visibility.Visible : System.Windows.Visibility.Collapsed;

            UrlsToAddTabControl.SelectedIndex = UrlsToAddTabControl.Items.IndexOf(UrlsToAddTabControl.Items.Cast<TabItem>().FirstOrDefault(ti => ti.Visibility == System.Windows.Visibility.Visible));
        }
Esempio n. 2
0
 /// <summary>
 /// Downloads <paramref name="fileUrl"/> into the local file <paramref name="tempFileName"/>,
 /// sending "Other" as the User-Agent header.
 /// </summary>
 /// <param name="type">Connection type; not used by this implementation.</param>
 /// <param name="fileUrl">Address of the resource to download.</param>
 /// <param name="tempFileName">Path of the local file that receives the content.</param>
 internal static void GetFile(ParseRuleConnectionType type, Uri fileUrl, string tempFileName)
 {
     WebClient client = new WebClient();
     try
     {
         client.Headers.Add("User-Agent: Other");
         client.DownloadFile(fileUrl, tempFileName);
     }
     finally
     {
         // Release the client whether or not the download succeeded.
         client.Dispose();
     }
 }
Esempio n. 3
0
        /// <summary>
        /// Loads the page at <paramref name="url"/>, collects the image references found in it and returns
        /// those that pass the size check, largest area first (ties ordered by URL).
        /// Errors are written to the log instead of being thrown; whatever was collected so far is returned.
        /// </summary>
        /// <param name="url">Address of the page to inspect.</param>
        /// <param name="minSize">Minimum image size passed to <c>Helper.CheckImageSize</c>.</param>
        /// <param name="collectIMGTags">Forwarded to <c>GetAllImagesUrlsFromUrl</c>.</param>
        /// <param name="collectLINKTags">Forwarded to <c>GetAllImagesUrlsFromUrl</c>.</param>
        /// <param name="collectMETATags">Forwarded to <c>GetAllImagesUrlsFromUrl</c>.</param>
        /// <param name="threadCount">Degree of parallelism used while checking the images; values below 1 are treated as 1.</param>
        /// <param name="prgItem">Optional progress node; it is set to 100 when the method finishes.</param>
        /// <param name="downloadImages">When true every candidate is downloaded to a temp file and loaded as an image.</param>
        /// <param name="type">Connection type used to load the page and the files.</param>
        /// <returns>One result per distinct image URL.</returns>
        public static ParseImageResult[] GetAllImagesFromUrl(
            string url,
            System.Drawing.Size minSize, 
            bool collectIMGTags,
            bool collectLINKTags,
            bool collectMETATags,
            int threadCount = 6, 
            Helpers.PercentageProgress prgItem = null, 
            bool downloadImages = false, 
            ParseRuleConnectionType type = ParseRuleConnectionType.Direct)
        {
            List<ParseImageResult> result = new List<ParseImageResult>();
            try
            {
                Helpers.PercentageProgress prgItemPage = null;
                Helpers.PercentageProgress prgItemImg = null;
                if (prgItem != null)
                {
                    prgItemPage = prgItem.GetChild();
                    prgItemImg = prgItem.GetChild();
                }

                // url is replaced by the response address reported by GetContent.
                HtmlAgilityPack.HtmlDocument document = SiteManager.GetContent(url, type, out url);

                if (prgItemPage != null)
                    prgItemPage.Value = 100;

                object lockAdd = new Object();

                // Materialize once: the sequence is both counted and iterated.
                var allLinks = GetAllImagesUrlsFromUrl(document, url, collectIMGTags, collectLINKTags, collectMETATags, null).ToArray();
                int fullCnt = allLinks.Length;
                int currLoaded = 0;

                object currLoadedLock = new Object();

                // WithDegreeOfParallelism throws for values below 1.
                int degreeOfParallelism = threadCount < 1 ? 1 : threadCount;

                allLinks
                    .AsParallel()
                    .WithDegreeOfParallelism(degreeOfParallelism)
                    .ForAll(node =>
                    {
                        Uri fileUrl = node.Url;
                        try
                        {
                            System.Drawing.Size imageSize;
                            if (Helper.CheckImageSize(fileUrl.AbsoluteUri, minSize, out imageSize, true, !downloadImages))
                            {
                                if (!imageSize.IsEmpty)
                                    SetImageSize(node.Node, imageSize);

                                string fileName = fileUrl.AbsolutePath.Split(new[] { '/' }).Last();
                                string downloadedFileName = System.IO.Path.GetTempPath() + Guid.NewGuid().ToString() + System.IO.Path.GetExtension(fileName);
                                string convertedFileName = null;

                                System.Drawing.Image image = null;
                                bool added = false;
                                try
                                {
                                    if (downloadImages)
                                    {
                                        SiteManager.GetFile(type, fileUrl, downloadedFileName);

                                        // Files starting with the "RIFF" signature are treated as WebP and converted to JPEG.
                                        bool isWebPArchive = false;
                                        using (Stream inputStream = System.IO.File.Open(downloadedFileName, System.IO.FileMode.Open))
                                        {
                                            var webPcheck = new byte[4];
                                            int btsRead;
                                            if ((btsRead = inputStream.Read(webPcheck, 0, webPcheck.Length)) > 0)
                                            {
                                                var firstStr = Encoding.ASCII.GetString(webPcheck, 0, btsRead);
                                                if (string.Equals(firstStr, "RIFF", StringComparison.OrdinalIgnoreCase))
                                                    isWebPArchive = true;
                                            }
                                        }

                                        if (isWebPArchive)
                                        {
                                            Imazen.WebP.Extern.LoadLibrary.LoadWebPOrFail();

                                            var decoder = new SimpleDecoder();
                                            convertedFileName = downloadedFileName + ".jpg";

                                            using (Stream inputStream = System.IO.File.Open(downloadedFileName, System.IO.FileMode.Open))
                                            using (FileStream outStream = new FileStream(convertedFileName, FileMode.Create))
                                            {
                                                var bytes = ReadFully(inputStream);
                                                using (var outBitmap = decoder.DecodeFromBytes(bytes, bytes.LongLength))
                                                    outBitmap.Save(outStream, System.Drawing.Imaging.ImageFormat.Jpeg);
                                            }
                                        }

                                        image = System.Drawing.Image.FromFile(convertedFileName ?? downloadedFileName);

                                        // The real dimensions of the downloaded image win over the pre-check value.
                                        imageSize = new System.Drawing.Size() { Height = image.Height, Width = image.Width };
                                        if (!imageSize.IsEmpty)
                                            SetImageSize(node.Node, imageSize);
                                    }

                                    if (!downloadImages || Helper.CheckImageSize(imageSize, minSize, false))
                                    {
                                        ParseImageResult parsed = new ParseImageResult(node.Node, image, imageSize, fileUrl);

                                        // List<T> is not safe for concurrent writes; ForAll runs this delegate on several threads.
                                        lock (lockAdd)
                                        {
                                            result.Add(parsed);
                                        }
                                        added = true;
                                    }
                                }
                                finally
                                {
                                    // An image that did not make it into the result is of no further use.
                                    if (!added && image != null)
                                        image.Dispose();

                                    if (downloadImages)
                                    {
                                        // Best-effort cleanup of both the download and the converted copy.
                                        // NOTE(review): a file still held open by a kept image may fail to delete here — confirm whether that matters.
                                        try { System.IO.File.Delete(downloadedFileName); }
                                        catch { }

                                        if (convertedFileName != null)
                                            try { System.IO.File.Delete(convertedFileName); }
                                            catch { }
                                    }
                                }
                            }
                        }
                        catch (Exception ex)
                        {
                            // Must not throw from here: an exception escaping this delegate aborts the whole parallel loop.
                            string fileUrlText = (fileUrl != null && fileUrl.IsAbsoluteUri) ? fileUrl.AbsoluteUri : Convert.ToString(fileUrl);
                            Helpers.Old.Log.Add(ex, string.Format("Helper.GetAllImagesFromUrl(url:'{0}',..,type:'{1}').ForAllThread(fileUrl:'{2}',..)", url, type, fileUrlText));
                        }
                        finally
                        {
                            if (prgItemImg != null)
                                lock (currLoadedLock)
                                {
                                    currLoaded++;
                                    prgItemImg.Value = ((decimal)currLoaded / (decimal)fullCnt) * 100m;
                                }
                        }
                    }
                );
                if (prgItemImg != null)
                    prgItemImg.Value = 100;
            }
            catch (Exception ex)
            {
                Helpers.Old.Log.Add(ex, string.Format("Helper.GetAllImagesFromUrl(url:'{0}',..,type:'{1}')", url, type));
            }

            if (prgItem != null && prgItem.Value != 100)
                prgItem.Value = 100;

            // Keep a single entry per URL, then order by area (largest first) with the URL as tie-breaker.
            return result
                .GroupBy(i => i.Url.AbsoluteUri)
                .Select(g => g.First())
                .OrderByDescending(i => i.ImageSize.Width * i.ImageSize.Height)
                .ThenBy(i => i.Url.ToString())
                .ToArray();
        }
Esempio n. 4
0
        /// <summary>
        /// Loads the HTML document at <paramref name="url"/> using the requested connection type.
        /// </summary>
        /// <param name="url">Address of the page to load.</param>
        /// <param name="type">
        /// <c>Direct</c> loads the page through <c>HtmlWeb</c>; the <c>IE_*</c> and <c>CHR_*</c> values load it through
        /// <c>SiteManagerIE</c> / <c>SiteManagerCHR</c>, with the number embedded in the enum name used as wait time.
        /// </param>
        /// <param name="urlResponse">Receives the response address reported by the loader, or <paramref name="url"/> when nothing was loaded.</param>
        /// <returns>The loaded document, or null when <paramref name="type"/> is not handled or its wait time cannot be parsed.</returns>
        /// <exception cref="InvalidOperationException">The SiteManager has not been initialized.</exception>
        public static HtmlAgilityPack.HtmlDocument GetContent(string url, ParseRuleConnectionType type, out string urlResponse)
        {
            urlResponse = url;

            if (!IsInited)
                throw new InvalidOperationException("SiteManager not inited. Use SiteManager.Init() to initialize components");

            HtmlAgilityPack.HtmlDocument document = null;

            if (type == ParseRuleConnectionType.Direct)
            {
                HtmlWeb htmlWeb = new HtmlWeb() { AutoDetectEncoding = true, UserAgent = "Other" };
                document = htmlWeb.Load(url);
                if (document.StreamEncoding != document.Encoding)
                {
                    // The encoding used for the stream differs from the one the document reports:
                    // load the page again with the document's encoding forced.
                    htmlWeb.AutoDetectEncoding = false;
                    htmlWeb.OverrideEncoding = document.Encoding;
                    document = htmlWeb.Load(url);
                }
                urlResponse = htmlWeb.ResponseUri.AbsoluteUri;
            }
            else
            {
                bool viaIE =
                    type == ParseRuleConnectionType.IE_00_sec ||
                    type == ParseRuleConnectionType.IE_05_sec ||
                    type == ParseRuleConnectionType.IE_10_sec;
                bool viaCHR =
                    type == ParseRuleConnectionType.CHR_00_sec ||
                    type == ParseRuleConnectionType.CHR_05_sec ||
                    type == ParseRuleConnectionType.CHR_10_sec;

                if (viaIE || viaCHR)
                {
                    // The wait time is the part of the enum name between the first and the last underscore
                    // (e.g. "05" in "IE_05_sec").
                    string waitSeconds = type.GetType().GetEnumName(type);
                    waitSeconds = waitSeconds.Substring(0, waitSeconds.LastIndexOf('_'));
                    waitSeconds = waitSeconds.Substring(waitSeconds.IndexOf('_') + 1);

                    int wait;
                    if (int.TryParse(waitSeconds, out wait))
                    {
                        if (viaIE)
                        {
                            SiteManagerIE mgr = new SiteManagerIE();
                            var res = mgr.Navigate(new Uri(url), wait);
                            document = new HtmlDocument();
                            // Same null guard as the CHR branch: LoadHtml does not accept null.
                            document.LoadHtml(res.Content ?? string.Empty);
                            urlResponse = res.ResponseUri.AbsoluteUri;
                        }
                        else
                        {
                            SiteManagerCHR mgr = new SiteManagerCHR();
                            // Unlike the IE manager, the CHR manager is given the parsed value multiplied by 1000.
                            var res = mgr.Navigate(new Uri(url), wait * 1000);
                            document = new HtmlDocument();
                            document.LoadHtml(res.Content ?? string.Empty);
                            urlResponse = res.ResponseUri.AbsoluteUri;
                        }
                    }
                }
            }

            return document;
        }