示例#1
0
        /// <summary>
        /// Fills <c>scienses</c>. If <c>Scien.ReadXml()</c> succeeds the list is taken from it;
        /// otherwise the list is built by browsing https://arxiv.org/archive/ and then
        /// saved through <c>Scien.WriteXml()</c>.
        /// </summary>
        private void CreateSciencesList()
        {
            Scien stored = new Scien();

            if (stored.ReadXml())
            {
                scienses = stored.Fields.Sciences;
                return;
            }

            string archive = "https://arxiv.org/archive/";
            chromeDrv.ChromeGoToURL(archive);
            Dictionary<string, IWebElement> lstElem_a = chromeDrv.GetLstElem_a();

            // The second driver opens each subject page, so the elements found by the
            // first driver on the index page remain usable while we iterate over them.
            List<Subject> subjects = new List<Subject>();

            foreach (KeyValuePair<string, IWebElement> el in lstElem_a)
            {
                Subject subject = new Subject();
                chromeDrv2.ChromeGoToURL(el.Value.GetAttribute("href"));
                subject.Category = chromeDrv2.GetCategory(el.Key);
                subject.Name     = el.Value.Text;
                subjects.Add(subject);
            }

            List<Science>     grouped = new List<Science>();
            ScienceDictionary sc      = new ScienceDictionary();

            foreach (Subject subj in subjects)
            {
                // Find the science this subject belongs to. When several keys match,
                // the last one wins (same precedence as the previous implementation).
                bool   matched     = false;
                string scienceName = null;

                foreach (KeyValuePair<string, string> kv in sc.subjects)
                {
                    if (subj.Name.Contains(kv.Key))
                    {
                        matched     = true;
                        scienceName = kv.Value;
                    }
                }

                // A subject that matches no known science used to add a null entry
                // to the list; skip it instead so the list never contains nulls.
                if (!matched)
                {
                    continue;
                }

                // Reuse the Science created for an earlier subject with the same name
                // so that each science appears once and collects all of its subjects.
                Science science = grouped.Find(s => s.Name == scienceName);
                if (science == null)
                {
                    science      = new Science();
                    science.Name = scienceName;
                    grouped.Add(science);
                }

                science.subjects.Add(subj);
            }

            scienses = grouped;

            // A fresh Scien is used for writing, independent of the failed read above.
            Scien scien = new Scien();
            scien.Fields.Sciences = scienses;
            scien.WriteXml();
        }
示例#2
0
        /// <summary>
        /// Walks article pages for every year/month/index in the configured range, extracts
        /// the article metadata with the web driver, optionally downloads the PDF and
        /// optionally writes the article to the database. Progress, errors and the final
        /// resume position are reported through the supplied synchronization context.
        /// </summary>
        /// <param name="context">
        /// Must be a <see cref="SynchronizationContext"/>; it is cast and used to
        /// <c>Send</c> the callbacks (an <see cref="InvalidCastException"/> is thrown otherwise).
        /// </param>
        public void GetSubjects(object context)
        {
            SynchronizationContext contextSync = (SynchronizationContext)context;

            DataAccess dal = new DataAccess();

            int year  = 0;
            int month = 0;
            int index = 0;

            try
            {
                _browser = new ChromeDrv();
                _driver  = _browser.GetWebDriver();
            }
            catch (Exception e)
            {
                contextSync.Send(OnGetArticles_Exception, e.Message);
                // If the ChromeDrv constructor itself failed, _browser may still be null;
                // guard the call so the handler does not throw a NullReferenceException.
                if (_browser != null)
                {
                    _browser.ExitDriver();
                }
                return;
            }

            Scien          scien2    = new Scien();
            List <Science> scienses2 = new List <Science>();

            if (scien2.ReadXml())
            {
                scienses2 = scien2.Fields.Sciences;
            }

            // Loop-invariant patterns, built once instead of once per article/category.
            Regex regexCat  = new Regex(@"\S+\.\S+");
            Regex regexDate = new Regex(@"\d{1,2}\s\D{3}\s\d{4}\s\d{2}\W\d{2}\W\d{2}");
            Regex regexPdf  = new Regex(@"(pdf)\/\d{4}");
            Regex regexFile = new Regex(@"\d{4}\W\d{4,5}\W(pdf)");

            for (year = _yearStart; year <= _yearStop; year++)
            {
                for (month = _monthStart; month <= _monthStop; month++)
                {
                    // No upper bound: the index loop ends on "page not found" or on cancel.
                    for (index = _indexStart; ; index++)
                    {
                        // A fresh model per attempt, so that Subject/Science (and any other
                        // field not set below) never carry over from the previous article.
                        ArticleModel article = new ArticleModel();

                        try
                        {
                            // <prefix><2-digit year><2-digit month>.<5-digit index>
                            article.Article_Source = string.Format("{0}{1:d2}{2:d2}.{3:d5}",
                                                                   _pathFirst, int.Parse(year.ToString().Substring(2, 2)), month, index);

                            contextSync.Send(OnGetSource, article.Article_Source);

                            _driver.Navigate().GoToUrl(article.Article_Source);

                            if (PageNotFound())
                            {
                                // End of this month's articles: the next month starts at index 1.
                                contextSync.Send(OnGetSource_Exception, "Page not found : " + article.Article_Source);
                                _indexStart = 1;
                                break;
                            }

                            article.Id = 0;

                            article.Category = _driver.FindElement(By.CssSelector("span.primary-subject")).Text;
                            bool flagReturn = false;

                            // Look for the first known category whose code (the "xxx.yyy" token)
                            // occurs in the page's primary subject; stop at the first hit.
                            foreach (Science sc in scienses2)
                            {
                                foreach (Subject sb in sc.subjects)
                                {
                                    foreach (string cat in sb.Category)
                                    {
                                        Match matchCat = regexCat.Match(cat);

                                        if (matchCat.Success)
                                        {
                                            if (article.Category.Contains(matchCat.Value))
                                            {
                                                // Keep the text that follows the matched token plus two
                                                // separator characters (measured from the start of cat).
                                                string catStr = cat.Substring(matchCat.Length + 2, cat.Length - matchCat.Length - 2);
                                                article.Category = catStr;
                                                article.Subject  = sb.Name;
                                                article.Science  = sc.Name;
                                                flagReturn       = true;
                                            }
                                        }
                                        if (flagReturn)
                                        {
                                            break;
                                        }
                                    }
                                    if (flagReturn)
                                    {
                                        break;
                                    }
                                }
                                if (flagReturn)
                                {
                                    break;
                                }
                            }

                            article.Title = _driver.FindElement(By.CssSelector("#abs > h1")).Text;
                            var lstElem = _driver.FindElements(By.CssSelector(".authors > a"));
                            article.Autors = "";
                            foreach (IWebElement elem in lstElem)
                            {
                                article.Autors += elem.Text + ", ";
                            }

                            string strDate = _driver.FindElement(By.CssSelector(".submission-history")).Text;
                            Match  match   = regexDate.Match(strDate);
                            article.Publication_Date = DateTime.Parse(match.Value);
                            article.Quotation        = _driver.FindElement(By.CssSelector("#abs > blockquote")).Text;
                            article.Document_Source  = _driver.FindElement(By.CssSelector("div.full-text > ul > li > a")).GetAttribute("href") + ".pdf";
                            match = regexPdf.Match(article.Document_Source);

                            if (match.Success)
                            {
                                // match.Value is "pdf/" followed by four digits; the first two and
                                // the last two digits name the two sub-folder levels for the file.
                                string y = match.Value.Substring(4, 2);
                                string m = match.Value.Substring(6, 2);
                                DirectoryInfo dir = new DirectoryInfo(_pathPDF + @"\" + y + @"\" + m);
                                if (!dir.Exists && _pathPDF != string.Empty)
                                {
                                    dir.Create();
                                }

                                Match  matchFile      = regexFile.Match(article.Document_Source);
                                string document_Local = _pathPDF + @"\" + y + @"\" + m + @"\" + matchFile.Value;
                                // Stored path is relative to _pathPDF.
                                article.Document_Local = @"\" + y + @"\" + m + @"\" + matchFile.Value;

                                if (_flagPDF)
                                {
                                    try
                                    {
                                        using (var client = new WebClient())
                                        {
                                            client.Headers.Add("User-Agent: Other");
                                            client.DownloadFile(article.Document_Source, document_Local);
                                        }
                                    }
                                    catch (Exception e)
                                    {
                                        contextSync.Send(OnGetSource_Exception, e.Message);
                                        // Step back so the loop's index++ retries the same article
                                        // (or so the resume position below points at it on cancel).
                                        index--;
                                        if (_cancel)
                                        {
                                            break;
                                        }
                                        continue;
                                    }
                                }
                            }
                            else
                            {
                                article.Document_Local  = "";
                                article.Document_Source = "";
                            }

                            article.Recording_Date = DateTime.Now;
                        }
                        catch (Exception e)
                        {
                            contextSync.Send(OnGetSource_Exception, e.Message);
                            // Back off before retrying the same index.
                            Thread.Sleep(10000);
                            index--;
                            if (_cancel)
                            {
                                break;
                            }
                            continue;
                        }

                        if (_flagArticles)
                        {
                            try
                            {
                                dal.WriteDBSQL(article);
                            }
                            catch (Exception e)
                            {
                                // A database failure stops the whole run; the current index is
                                // kept as the resume position.
                                contextSync.Send(OnGetArticles_Exception, e.Message);
                                _cancel = true;
                                index--;
                                break;
                            }
                        }
                        if (_cancel)
                        {
                            break;
                        }
                    }
                    if (_cancel)
                    {
                        break;
                    }
                    if (month == 12)
                    {
                        // The following year starts from January.
                        _monthStart = 1;
                    }
                }
                if (_cancel)
                {
                    break;
                }
            }

            // Remember where to resume: the loop counters as they stand, with the index
            // advanced by one past the last value held by the loop variable.
            _yearStart  = year;
            _monthStart = month;
            _indexStart = ++index;

            lstSettings = new List <int>();
            lstSettings.Add(_yearStart);
            lstSettings.Add(_monthStart);
            lstSettings.Add(_indexStart);

            contextSync.Send(OnGetSettings, lstSettings);
            contextSync.Send(OnGetArticles_Cancel, true);
        }