Exemple #1
0
        /// <summary>
        /// Produces a deep copy of this message through <c>Serializer.DeepClone</c> and
        /// assigns the supplied value to the copy's <c>Input</c>.
        /// </summary>
        /// <param name="input">Value stored in the <c>Input</c> member of the copy.</param>
        /// <returns>The cloned message carrying <paramref name="input"/>.</returns>
        public msg clone(object input)
        {
            msg copy = Serializer.DeepClone<msg>(this);
            copy.Input = input;
            return copy;
        }
Exemple #2
0
 /// <summary>
 /// Stores the API and the optional completion handler, creates the two reset events and
 /// starts a worker thread that initialises the API and then executes the current
 /// <c>_msg</c> every time the reset event is signalled, until <c>_exit</c> is set.
 /// </summary>
 /// <param name="api">API that the worker thread initialises and runs messages against.</param>
 /// <param name="on_message">
 /// Optional handler stored in <c>onMessageComplete</c>; the worker loop below does not invoke it.
 /// </param>
 public threadMsg(IAPI api, EventHandler<threadMsgEventArgs> on_message = null)
 {
     onMessageComplete = on_message;
     _api              = api;
     _resetEvent       = new ManualResetEvent(false);
     _threadEvent      = new ManualResetEvent(false);

     ParameterizedThreadStart worker = state =>
     {
         // One-time start-up on the worker thread.
         api.Init();
         api.Open = true;
         app.postToAPI(_API.MEDIA, _API.MEDIA_KEY_INITED, null);

         threadMsgPara para = (threadMsgPara)state;
         while (!_exit)
         {
             // Sleep until a message is posted (or shutdown is requested).
             para.ResetEvent.WaitOne();
             if (_exit)
             {
                 break;
             }

             // The value returned by the API is intentionally not used here.
             api.Execute(_msg);

             // Re-arm the event so the next WaitOne blocks again.
             para.ResetEvent.Reset();
         }
     };

     _thread = new Thread(worker);
     _thread.Start(new threadMsgPara(_resetEvent));
 }
Exemple #3
0
 /// <summary>
 /// Forwards a message straight to the main object returned by <c>app.get_Main()</c>,
 /// looking it up first when <c>fom</c> has not been set yet.
 /// </summary>
 /// <param name="m">Message wrapped in a <c>threadMsgEventArgs</c> and handed to the main object.</param>
 public void response_toMainRuntime(msg m)
 {
     // Lazily resolve the main object the first time it is needed.
     if (fom == null)
     {
         fom = app.get_Main();
     }

     // Still unavailable: nothing to deliver the message to.
     if (fom == null)
     {
         return;
     }

     fom.api_responseMsg(null, new threadMsgEventArgs(m));
 }
Exemple #4
0
        /// <summary>
        /// Starts the next batch of crawl workers from <c>listUrl</c>, or, when the batch is
        /// empty, reports the crawl as complete to the main side.
        /// </summary>
        /// <param name="m">Incoming message; returned unchanged.</param>
        /// <returns>The same <paramref name="m"/> instance.</returns>
        msg f_CRAWLER_KEY_REQUEST_LINK(msg m)
        {
            if (CRAWLER_KEY_STOP)
            {
                f_CRAWLER_KEY_STOP_reset();
            }

            // URLs that already have stored HTML.
            string[] alreadyCrawled = dicHtml.Keys.ToArray();

            // NOTE(review): Truncate is a project helper; presumably it filters listUrl by the
            // predicate - confirm its exact semantics (and the meaning of the flag) at its definition.
            listUrl.Truncate(candidate => !alreadyCrawled.Any(done => done == candidate), true);

            // Pick at most crawlMaxThread URLs for this round and filter them out of the pending list.
            string[] batch = listUrl.Take(crawlMaxThread);
            listUrl.Truncate(candidate => !batch.Any(picked => picked == candidate));

            Interlocked.Exchange(ref crawlPending, listUrl.Count);
            Interlocked.Exchange(ref crawlCounter, batch.Length);

            // CompareExchange(ref x, 0, 0) leaves x unchanged and returns its current value.
            if (Interlocked.CompareExchange(ref crawlCounter, 0, 0) != 0)
            {
                // Work remains: hand each URL of the batch to its own worker.
                for (int slot = 0; slot < batch.Length; slot++)
                {
                    tasks[slot].RunWorkerAsync(batch[slot]);
                }

                return m;
            }

            // Nothing left to crawl: announce the result, persist it, then signal completion.
            string[] rs_out = dicHtml.Keys.ToArray();

            msg summary = new msg()
            {
                API = _API.CRAWLER, KEY = _API.CRAWLER_KEY_REQUEST_LINK, Log = "Crawle complete result: " + rs_out.Length + " links. Writing file ..."
            };
            response_toMain(summary);

            // The file is written only when crawlResult is greater than 1.
            if (Interlocked.CompareExchange(ref crawlResult, 1, 1) > 1)
            {
                write_file_contentHTML();
            }

            msg completed = new msg()
            {
                API = _API.CRAWLER, KEY = _API.CRAWLER_KEY_REQUEST_LINK_COMPLETE, Input = rs_out
            };
            response_toMain(completed);

            return m;
        }
Exemple #5
0
 /// <summary>
 /// Raises the stop flag, reports the URLs collected so far to the main side, writes the
 /// content file when more than one result was recorded, and signals completion.
 /// </summary>
 /// <param name="m">Incoming message; returned unchanged.</param>
 /// <returns>The same <paramref name="m"/> instance.</returns>
 msg f_CRAWLER_KEY_STOP(msg m)
 {
     CRAWLER_KEY_STOP = true;

     // Snapshot of every URL that currently has stored HTML.
     string[] collected = dicHtml.Keys.ToArray();

     msg summary = new msg()
     {
         API = _API.CRAWLER, KEY = _API.CRAWLER_KEY_REQUEST_LINK, Log = "Crawle complete result: " + collected.Length + " links. Writing file ..."
     };
     response_toMain(summary);

     // CompareExchange(ref x, 1, 1) returns the current value; write only when it exceeds 1.
     bool hasResults = Interlocked.CompareExchange(ref crawlResult, 1, 1) > 1;
     if (hasResults)
     {
         write_file_contentHTML();
     }

     msg completed = new msg()
     {
         API = _API.CRAWLER, KEY = _API.CRAWLER_KEY_REQUEST_LINK_COMPLETE, Input = collected
     };
     response_toMain(completed);

     return m;
 }
Exemple #6
0
        /// <summary>
        /// Creates the two polling timers if they do not exist yet. <c>timer_api</c> fires every
        /// 100 ms and <c>timer_msg</c> every 500 ms; on each tick at most one message is taken from
        /// <c>cache_api</c> / <c>cache_msg</c> and passed to the main object's
        /// <c>api_responseMsg</c>.
        /// </summary>
        public api_base()
        {
            if (timer_api == null)
            {
                timer_api = new System.Threading.Timer(state =>
                {
                    // Resolve the main object lazily; it may not exist when the timer starts.
                    if (fom == null)
                    {
                        fom = app.get_Main();
                    }

                    if (cache_api.Count <= 0)
                    {
                        return;
                    }

                    // The message is dequeued even when it cannot be delivered.
                    msg pending = cache_api.Dequeue();
                    if (fom == null || pending == null)
                    {
                        return;
                    }

                    fom.api_responseMsg(null, new threadMsgEventArgs(pending));
                }, fom, 100, 100);
            }

            if (timer_msg == null)
            {
                timer_msg = new System.Threading.Timer(state =>
                {
                    // Resolve the main object lazily; it may not exist when the timer starts.
                    if (fom == null)
                    {
                        fom = app.get_Main();
                    }

                    if (cache_msg.Count <= 0)
                    {
                        return;
                    }

                    // The message is dequeued even when it cannot be delivered.
                    msg pending = cache_msg.Dequeue();
                    if (fom == null || pending == null)
                    {
                        return;
                    }

                    fom.api_responseMsg(null, new threadMsgEventArgs(pending));
                }, fom, 500, 500);
            }
        }
Exemple #7
0
        /// <summary>
        /// Routes a crawler message to the handler selected by its <c>KEY</c>, then marks the
        /// message's <c>Output</c> as successful with no data. Keys without a handler still get
        /// the successful output.
        /// </summary>
        /// <param name="m">Message to process; a null message is returned as-is.</param>
        /// <returns>The same <paramref name="m"/> instance that was passed in.</returns>
        public msg Execute(msg m)
        {
            if (m != null)
            {
                // The handlers' return values are not used; each returns the message it was given.
                switch (m.KEY)
                {
                case _API.CRAWLER_KEY_STOP:
                    f_CRAWLER_KEY_STOP(m);
                    break;

                case _API.CRAWLER_KEY_REGISTER_PATH:
                    f_CRAWLER_KEY_REGISTER_PATH(m);
                    break;

                case _API.CRAWLER_KEY_REQUEST_LINK:
                    f_CRAWLER_KEY_REQUEST_LINK(m);
                    break;

                // Package conversion (to HTML / to text) has no active implementation:
                // both keys are accepted and do nothing.
                case _API.CRAWLER_KEY_CONVERT_PACKAGE_TO_HTML:
                case _API.CRAWLER_KEY_CONVERT_PACKAGE_TO_TEXT:
                    break;
                }

                m.Output.Ok   = true;
                m.Output.Data = null;
            }

            return m;
        }
Exemple #8
0
        /// <summary>
        /// Resets the crawler state for a new start URL, reads the optional settings, derives the
        /// current domain and sub-path from the URL, and asynchronously downloads the start page.
        /// When the page arrives its HTML is stored, its links are queued and the first batch of
        /// link requests is started; if the page yields no links, completion is reported to main.
        /// </summary>
        /// <param name="m">
        /// Message whose <c>Input</c> is cast to <c>oLinkSetting</c>; when <c>Input</c> is null
        /// only the state reset is performed.
        /// </param>
        /// <returns>The same <paramref name="m"/> instance; the download completes later on a callback.</returns>
        msg f_CRAWLER_KEY_REGISTER_PATH(msg m)
        {
            CRAWLER_KEY_STOP     = false;
            domain_current       = string.Empty;
            setting_URL_CONTIANS = string.Empty;
            setting_PARA1        = string.Empty;
            setting_PARA2        = string.Empty;

            if (m.Input == null)
            {
                return(m);
            }

            Interlocked.Exchange(ref crawlResult, 0);

            oLinkSetting st       = (oLinkSetting)m.Input;
            string       para_url = st.Url;

            if (st.Settings != null && st.Settings.Count > 0)
            {
                st.Settings.TryGetValue("URL_CONTIANS", out setting_URL_CONTIANS);
                st.Settings.TryGetValue("PARA1", out setting_PARA1);
                st.Settings.TryGetValue("PARA2", out setting_PARA2);

                // TryGetValue writes null into its out argument when the key is missing, which
                // would undo the string.Empty defaults assigned above; restore them.
                if (setting_URL_CONTIANS == null)
                {
                    setting_URL_CONTIANS = string.Empty;
                }
                if (setting_PARA1 == null)
                {
                    setting_PARA1 = string.Empty;
                }
                if (setting_PARA2 == null)
                {
                    setting_PARA2 = string.Empty;
                }
            }

            // For "scheme://host/path" the split yields [scheme:, "", host, path, ...].
            // NOTE(review): a URL without a scheme has fewer than three parts and fails on a[2];
            // url_sub_path_current also keeps its previous value when there is no path segment.
            string[] a = para_url.Split('/');
            domain_current = a[2].ToLower();
            if (domain_current.StartsWith("www."))
            {
                domain_current = domain_current.Substring(4);
            }
            if (a.Length > 3)
            {
                url_sub_path_current = a[3];
            }

            dicHtml.Clear();
            listUrl.Clear();

            read_file_contentHTML();

            HttpWebRequest w = (HttpWebRequest)WebRequest.Create(new Uri(para_url));
            w.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36";
            w.BeginGetResponse(asyncResult =>
            {
                string url = null;
                string htm = null;

                // EndGetResponse throws WebException for network failures and HTTP error statuses.
                // This callback has no caller that could handle it, so failures are reported to
                // main as a log message. The using blocks release the response and reader on
                // every path, including a non-OK status.
                try
                {
                    using (HttpWebResponse rs = (HttpWebResponse)w.EndGetResponse(asyncResult))
                    {
                        url = rs.ResponseUri.ToString();
                        response_toMain(new msg()
                        {
                            API = _API.CRAWLER, KEY = _API.CRAWLER_KEY_REQUEST_LINK, Log = url
                        });

                        if (rs.StatusCode != HttpStatusCode.OK)
                        {
                            return;
                        }

                        using (StreamReader sr = new StreamReader(rs.GetResponseStream(), Encoding.UTF8))
                        {
                            htm = sr.ReadToEnd();
                        }
                    }
                }
                catch (WebException ex)
                {
                    response_toMain(new msg()
                    {
                        API = _API.CRAWLER, KEY = _API.CRAWLER_KEY_REQUEST_LINK, Log = "Request failed: " + para_url + " (" + ex.Message + ")"
                    });
                    return;
                }
                catch (IOException ex)
                {
                    response_toMain(new msg()
                    {
                        API = _API.CRAWLER, KEY = _API.CRAWLER_KEY_REQUEST_LINK, Log = "Request failed: " + para_url + " (" + ex.Message + ")"
                    });
                    return;
                }

                if (string.IsNullOrEmpty(htm))
                {
                    return;
                }

                htm = HttpUtility.HtmlDecode(htm);
                htm = format_HTML(htm);

                if (!dicHtml.ContainsKey(url))
                {
                    dicHtml.Add(url, htm);
                    Interlocked.Increment(ref crawlResult);
                }

                var us = get_Urls(url, htm);

                if (CRAWLER_KEY_STOP)
                {
                    f_CRAWLER_KEY_STOP_reset();
                    return;
                }

                if (us.Url_Html.Length > 0)
                {
                    listUrl.AddRange(us.Url_Html);
                    Execute(new msg()
                    {
                        API = _API.CRAWLER, KEY = _API.CRAWLER_KEY_REQUEST_LINK
                    });
                }
                else
                {
                    // No links found: report completion to main directly. Sending this key
                    // through Execute would drop it, because Execute's switch has no case for
                    // CRAWLER_KEY_REQUEST_LINK_COMPLETE.
                    response_toMain(new msg()
                    {
                        API = _API.CRAWLER, KEY = _API.CRAWLER_KEY_REQUEST_LINK_COMPLETE, Input = dicHtml.Keys.ToArray()
                    });
                }
            }, w);

            return(m);
        }
Exemple #9
0
 /// <summary>
 /// Stores the message in <c>_msg</c> and signals <c>_resetEvent</c>.
 /// </summary>
 /// <param name="msg">Message stored in <c>_msg</c> before the event is signalled.</param>
 public void Execute(msg msg)
 {
     // Store the message first so it is in place before the event is signalled.
     _msg = msg;
     _resetEvent.Set();
 }
Exemple #10
0
 /// <summary>
 /// Creates event arguments that carry the given message in <c>Message</c>.
 /// </summary>
 /// <param name="msg">Message exposed through <c>Message</c>.</param>
 public threadMsgEventArgs(msg msg)
 {
     this.Message = msg;
 }
Exemple #11
0
 /// <summary>
 /// Adds the message to the <c>cache_api</c> queue; nothing is delivered by this call itself.
 /// </summary>
 /// <param name="m">Message to enqueue.</param>
 public void response_toMain(msg m)
 {
     cache_api.Enqueue(m);
 }
Exemple #12
0
 /// <summary>
 /// Adds the message to the <c>cache_msg</c> queue; nothing is delivered by this call itself.
 /// </summary>
 /// <param name="m">Message to enqueue.</param>
 public void notification_toMain(msg m)
 {
     cache_msg.Enqueue(m);
 }